Skip to content

generation

Search string generation.

generate_search_string(topics, n_words_per_topic, *, n_similar_words_per_word=0, similar_words_generator=None)

Generates a search string that will be enriched with the desired number of similar words.

Parameters:

Name Type Description Default
topics list[list[str]]

List of topics to use.

required
n_words_per_topic int

Number of words to keep in each topic.

required
n_similar_words_per_word int

Number of similar words to generate for each word in each topic.

0
similar_words_generator SimilarWordsGenerator | None

Instance of SimilarWordsGenerator. Must be provided when n_similar_words_per_word is greater than 0.

None

Returns:

Type Description
str

A search string.

Source code in src/sesg/search_string/generation.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def generate_search_string(
    topics: list[list[str]],
    n_words_per_topic: int,
    *,
    n_similar_words_per_word: int = 0,
    similar_words_generator: SimilarWordsGenerator | None = None,
) -> str:
    """Generates a search string, optionally enriched with similar words.

    When `n_similar_words_per_word` is 0 the work is delegated to
    `generate_search_string_without_similar_words`; otherwise it is delegated to
    `generate_search_string_with_similar_words`.

    Args:
        topics (list[list[str]]): List of topics to use.
        n_words_per_topic (int): Number of words to keep in each topic.
        n_similar_words_per_word (int): Number of similar words to generate for
            each word in each topic. Must not be negative.
        similar_words_generator (SimilarWordsGenerator | None): Instance of
            SimilarWordsGenerator. Required when `n_similar_words_per_word` is
            greater than 0.

    Returns:
        A search string.

    Raises:
        ValueError: If `n_similar_words_per_word` is negative, or if it is
            greater than 0 and `similar_words_generator` is not provided.
    """
    if n_similar_words_per_word < 0:
        # A negative count is meaningless, and it would later be used as a
        # negative slice bound when limiting the similar words of each word.
        raise ValueError("n_similar_words_per_word must not be negative")

    if n_similar_words_per_word == 0:
        return generate_search_string_without_similar_words(
            topics=topics,
            n_words_per_topic=n_words_per_topic,
        )

    if similar_words_generator is None:
        raise ValueError(
            "similar_words_generator must be provided if n_similar_words_per_word > 0"
        )

    return generate_search_string_with_similar_words(
        topics=topics,
        n_words_per_topic=n_words_per_topic,
        n_similar_words_per_word=n_similar_words_per_word,
        similar_words_generator=similar_words_generator,
    )

generate_search_string_with_similar_words(topics, n_words_per_topic, n_similar_words_per_word, similar_words_generator)

Generates a search string with the following steps.

  1. Reduces the number of words per topic.
  2. For each word in each topic, finds similar words with the given function.

Parameters:

Name Type Description Default
topics list[list[str]]

List of topics to use.

required
n_words_per_topic int

Number of words to keep in each topic.

required
n_similar_words_per_word int

Number of similar words to generate for each word in each topic.

required
similar_words_generator SimilarWordsGenerator

Instance of SimilarWordsGenerator.

required

Returns:

Type Description
str

The search string.

Source code in src/sesg/search_string/generation.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def generate_search_string_with_similar_words(
    topics: list[list[str]],
    n_words_per_topic: int,
    n_similar_words_per_word: int,
    similar_words_generator: SimilarWordsGenerator,
) -> str:
    """Generates a search string with the following steps.

    1. Reduces the number of words per topic.
    2. For each word in each topic, finds similar words with the given generator.
    3. Filters the similar words with `filter_with_stemming` and keeps at most
       `n_similar_words_per_word` of them.
    4. Joins the result with `join_topics_with_similar_words`.

    Args:
        topics (list[list[str]]): List of topics to use.
        n_words_per_topic (int): Number of words to keep in each topic.
        n_similar_words_per_word (int): Number of similar words to generate for
            each word in each topic. Must not be negative.
        similar_words_generator (SimilarWordsGenerator): Instance of
            SimilarWordsGenerator. It is called with a single word.

    Returns:
        The search string.

    Raises:
        ValueError: If `n_similar_words_per_word` is negative.
    """
    if n_similar_words_per_word < 0:
        # A negative slice bound would silently drop the last similar words
        # instead of limiting how many are kept.
        raise ValueError("n_similar_words_per_word must not be negative")

    topics_list = reduce_number_of_words_per_topic(
        topics=topics,
        n_words_per_topic=n_words_per_topic,
    )

    topics_with_similar_words: list[list[list[str]]] = []

    for topic in topics_list:
        topic_part: list[list[str]] = []
        for token in topic:
            similar_words = filter_with_stemming(
                token,
                similar_words_list=similar_words_generator(token),
            )

            # Each entry is the word itself followed by at most
            # `n_similar_words_per_word` of its filtered similar words.
            topic_part.append([token, *similar_words[:n_similar_words_per_word]])

        topics_with_similar_words.append(topic_part)

    return join_topics_with_similar_words(topics_with_similar_words)

generate_search_string_without_similar_words(*, topics, n_words_per_topic)

Generates a search string by reducing the number of words in each topic, and joining the reduced topics.

Words from the same topic are joined with AND, and topics are joined with OR.

Parameters:

Name Type Description Default
topics list[list[str]]

List of topics to use.

required
n_words_per_topic int

Number of words to keep in each topic.

required

Returns:

Type Description
str

The search string.

Source code in src/sesg/search_string/generation.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def generate_search_string_without_similar_words(
    *,
    topics: list[list[str]],
    n_words_per_topic: int,
) -> str:
    """Builds a search string from the topics alone, without similar words.

    Each topic is first trimmed with `reduce_number_of_words_per_topic`, and the
    trimmed topics are then handed to `join_topics_without_similar_words`, which
    produces the final string (documented as joining words from the same topic
    with `AND`, and topics with `OR`).

    Args:
        topics (list[list[str]]): List of topics to use.
        n_words_per_topic (int): Number of words to keep in each topic.

    Returns:
        The search string.
    """
    reduced_topics = reduce_number_of_words_per_topic(
        topics=topics,
        n_words_per_topic=n_words_per_topic,
    )
    return join_topics_without_similar_words(reduced_topics)