Skip to content

formulation

Search string formulation utilities.

InvalidPubyearBoundariesError

Bases: ValueError

The provided pubyear boundaries are invalid.

Source code in src/sesg/search_string/formulation.py
6
7
class InvalidPubyearBoundariesError(ValueError):
    """Signal that the requested publication-year boundaries are invalid.

    Subclasses ValueError, so callers that already handle ValueError will
    also catch this error.
    """

join_tokens_with_operator(tokens, operator, *, use_double_quotes=False, use_parenthesis=False)

Joins the tokens in the list using the provided operator.

Each token is first wrapped in double quotes (if use_double_quotes is True) and then wrapped in parentheses (if use_parenthesis is True). If both are True, each token gets both, with the quotes inside the parentheses, e.g. ("token").

Parameters:

Name Type Description Default
operator Literal['AND', 'OR']

Operator to use to join.

required
tokens Iterable[str]

Tokens to join.

required
use_double_quotes bool

Whether to put double quotes surrounding each token.

False
use_parenthesis bool

Whether to surround each token with parentheses.

False

Returns:

Type Description
str

A string with the joined tokens.

Examples:

>>> join_tokens_with_operator(["machine", "learning", "SLR"], "AND", use_double_quotes=True)
'"machine" AND "learning" AND "SLR"'
Source code in src/sesg/search_string/formulation.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def join_tokens_with_operator(
    tokens: Iterable[str],
    operator: Literal["AND", "OR"],
    *,
    use_double_quotes: bool = False,
    use_parenthesis: bool = False,
) -> str:
    """Join tokens into a single string, separated by the given operator.

    Each token is decorated before joining: double quotes are applied first
    (when requested), then parentheses (when requested). With both flags set,
    a token `x` therefore becomes `("x")`.

    Args:
        tokens (Iterable[str]): Tokens to join.
        operator (Literal["AND", "OR"]): Operator placed between tokens, padded with one space on each side.
        use_double_quotes (bool): Whether to surround each token with double quotes.
        use_parenthesis (bool): Whether to surround each token with parentheses.

    Returns:
        A string with the joined tokens.

    Examples:
        >>> join_tokens_with_operator(["machine", "learning", "SLR"], "AND", use_double_quotes=True)
        '"machine" AND "learning" AND "SLR"'
    """  # noqa: E501

    def decorate(token: str) -> str:
        # Quotes go on first so that parentheses end up on the outside.
        quoted = f'"{token}"' if use_double_quotes else token
        return f"({quoted})" if use_parenthesis else quoted

    separator = f" {operator} "
    return separator.join(map(decorate, tokens))

join_topics_with_similar_words(topics)

Joins the topics in the list, creating a search string.

Specialization of sesg.search_string.formulation.join_tokens_with_operator to join a list of topics that includes similar words.

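Each topic is a list of groups, and each group is a list of words that are considered similar to one another. Words within a group are joined with OR, groups within a topic are joined with AND, and topics are joined with OR.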

Parameters:

Name Type Description Default
topics list[list[list[str]]]

List of topics to join.

required

Returns:

Type Description
str

A valid search string.

Examples:

>>> join_topics_with_similar_words([
...     [["machine", "computer"], ["learning", "knowledge"]],
...     [["code", "software"], ["smell", "defect"]]
... ])
'(("machine" OR "computer") AND ("learning" OR "knowledge")) OR (("code" OR "software") AND ("smell" OR "defect"))'
Source code in src/sesg/search_string/formulation.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def join_topics_with_similar_words(
    topics: list[list[list[str]]],
) -> str:
    """Build a search string from topics whose words carry similar-word groups.

    Specialization of [sesg.search_string.formulation.join_tokens_with_operator][] for
    topics that include similar words.

    Each topic is a list of groups, and each group is a list of words that are
    considered similar. Three levels of joining are applied:

    1. Words inside a group are double-quoted and joined with OR.
    2. Groups inside a topic are parenthesized and joined with AND.
    3. Topics are parenthesized and joined with OR.

    Args:
        topics (list[list[list[str]]]): List of topics to join.

    Returns:
        A valid search string.

    Examples:
        >>> join_topics_with_similar_words([
        ...     [["machine", "computer"], ["learning", "knowledge"]],
        ...     [["code", "software"], ["smell", "defect"]]
        ... ])
        '(("machine" OR "computer") AND ("learning" OR "knowledge")) OR (("code" OR "software") AND ("smell" OR "defect"))'
    """  # noqa: E501
    joined_topics = [
        join_tokens_with_operator(
            # innermost level: alternatives for one word, OR-ed together
            [
                join_tokens_with_operator(group, "OR", use_double_quotes=True)
                for group in topic
            ],
            # middle level: every group of the topic must match
            "AND",
            use_parenthesis=True,
        )
        for topic in topics
    ]

    # outermost level: any topic may match
    return join_tokens_with_operator(joined_topics, "OR", use_parenthesis=True)

join_topics_without_similar_words(topics)

Joins the topics in the list, creating a search string.

Specialization of sesg.search_string.formulation.join_tokens_with_operator to join a list of topics that does not have similar words included.

Each topic is a list of words (or tokens).

Parameters:

Name Type Description Default
topics list[list[str]]

List of topics to join.

required

Returns:

Type Description
str

A valid search string.

Examples:

>>> join_topics_without_similar_words([["machine", "learning"], ["code", "smell"]])
'("machine" AND "learning") OR ("code" AND "smell")'
Source code in src/sesg/search_string/formulation.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def join_topics_without_similar_words(
    topics: list[list[str]],
) -> str:
    """Build a search string from topics made of plain words.

    Specialization of [sesg.search_string.formulation.join_tokens_with_operator][] for
    topics that do not include similar words.

    Each topic is a list of words (or tokens). Words of one topic are
    double-quoted and joined with AND; the resulting topics are then
    parenthesized and joined with OR.

    Args:
        topics (list[list[str]]): List of topics to join.

    Returns:
        A valid search string.

    Examples:
        >>> join_topics_without_similar_words([["machine", "learning"], ["code", "smell"]])
        '("machine" AND "learning") OR ("code" AND "smell")'
    """  # noqa: E501
    # every word of a topic must match
    and_clauses = [
        join_tokens_with_operator(words, "AND", use_double_quotes=True)
        for words in topics
    ]

    # any topic may match
    return join_tokens_with_operator(and_clauses, "OR", use_parenthesis=True)

reduce_number_of_words_per_topic(topics, n_words_per_topic)

Reduces the number of words in each topic.

Parameters:

Name Type Description Default
topics list[list[str]]

List with the topics.

required
n_words_per_topic int

Number of words to keep in each topic.

required

Returns:

Type Description
list[list[str]]

List with the reduced topics.

Examples:

>>> reduce_number_of_words_per_topic([["machine", "learning"], ["code", "smell"]], 1)
[['machine'], ['code']]
Source code in src/sesg/search_string/formulation.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def reduce_number_of_words_per_topic(
    topics: list[list[str]],
    n_words_per_topic: int,
) -> list[list[str]]:
    """Keep only the leading words of every topic.

    Each topic is sliced to its first `n_words_per_topic` entries. The input
    list and its topics are left untouched; a new list is returned.

    Args:
        topics (list[list[str]]): List with the topics.
        n_words_per_topic (int): Number of words to keep in each topic.

    Returns:
        List with the reduced topics.

    Examples:
        >>> reduce_number_of_words_per_topic([["machine", "learning"], ["code", "smell"]], 1)
        [['machine'], ['code']]
    """  # noqa: E501
    leading = slice(None, n_words_per_topic)

    return [words[leading] for words in topics]

set_pub_year_boundaries(string, *, min_year=None, max_year=None)

Given a search string, will append PUBYEAR > and PUBYEAR < boundaries as needed.

Parameters:

Name Type Description Default
string str

A search string.

required
min_year int | None

Minimum year of publication. Defaults to None.

None
max_year int | None

Maximum year of publication. Defaults to None.

None

Returns:

Type Description
str

A search string with PUBYEAR boundaries.

Examples:

>>> set_pub_year_boundaries(string='title("machine" and "learning")', max_year=2018)
'title("machine" and "learning") AND PUBYEAR < 2018'
Source code in src/sesg/search_string/formulation.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def set_pub_year_boundaries(
    string: str,
    *,
    min_year: int | None = None,
    max_year: int | None = None,
) -> str:
    """Given a search string, will append `PUBYEAR >` and `PUBYEAR <` boundaries as needed.

    Args:
        string (str): A search string.
        min_year (int | None, optional): Minimum year of publication. Defaults to None.
        max_year (int | None, optional): Maximum year of publication. Defaults to None.

    Returns:
        A search string with PUBYEAR boundaries.

    Examples:
        >>> set_pub_year_boundaries(string='title("machine" and "learning")', max_year=2018)
        'title("machine" and "learning") AND PUBYEAR < 2018'
    """  # noqa: E501
    if min_year is not None and max_year is not None and min_year >= max_year:
        raise InvalidPubyearBoundariesError("Max year must be greater than min year")

    if min_year is not None:
        string += f" AND PUBYEAR > {min_year}"

    if max_year is not None:
        string += f" AND PUBYEAR < {max_year}"

    return string