Skip to content

similar_words

Similar words module.

Provides strategies for extracting similar words from a given word and filtering them.

BertSimilarWordsGenerator dataclass

Bases: SimilarWordsGenerator

Generate similar words using BERT.

Attributes:

Name Type Description
enrichment_text str

Text that will be used to find similar words.

bert_tokenizer Any

A BERT tokenizer. For example, BertTokenizer.from_pretrained("bert-base-uncased").

bert_model Any

A BERT model. For example, BertForMaskedLM.from_pretrained("bert-base-uncased").

Source code in src/sesg/similar_words/bert_strategy.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
@dataclass
class BertSimilarWordsGenerator(SimilarWordsGenerator):
    """Generate similar words using BERT.

    The generator looks up the first sentence of `enrichment_text` that
    contains the requested word, masks the matching token(s) in that sentence
    and asks the masked language model for replacement candidates.

    Attributes:
        enrichment_text (str): Text that will be used to find similar words.
        bert_tokenizer (Any): A BERT tokenizer. For example, `BertTokenizer.from_pretrained("bert-base-uncased")`.
        bert_model (Any): A BERT model. For example, `BertForMaskedLM.from_pretrained("bert-base-uncased")`.
    """  # noqa: E501

    enrichment_text: str
    bert_tokenizer: Any
    bert_model: Any

    @staticmethod
    def create_enrichment_text(
        studies_list: list[EnrichmentStudy],
    ) -> str:
        r"""Creates a piece of text that consists of the concatenation of the title and abstract of each study.

        Each study contributes one line of the form `"<title> <abstract>"`,
        stripped of surrounding whitespace, with every `"\r\n"` replaced by
        `"#."` and terminated by `"\n"`.

        Args:
            studies_list (list[EnrichmentStudy]): List of studies with title and abstract.

        Returns:
            The enrichment text. An empty list yields an empty string.

        Examples:
            >>> studies = [
            ...     EnrichmentStudy(title="title1", abstract="abstract1"),
            ...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
            ...     EnrichmentStudy(title="title3", abstract="abstract3"),
            ... ]
            >>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
            'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
        """  # noqa: E501
        # Build the text with a single join instead of repeated `+=`.
        return "".join(
            f"{study['title']} {study['abstract']}".strip().replace("\r\n", "#.")
            + "\n"
            for study in studies_list
        )

    def __call__(self, word: str) -> list[str]:
        """Generate similar words using BERT.

        Args:
            word (str): Word from which to find similar words.

        Returns:
            List of similar words: the top 30 predictions for the masked
            position, minus the ones rejected by `check_is_bert_oov_word`.
            The list is empty when `word` is empty, contains a space, does
            not occur in `enrichment_text`, or does not show up in any token
            of the selected sentence.
        """
        # An empty string is a substring of every token, and words containing
        # a space are rejected outright.
        if not word or " " in word:
            return []

        # Only the first sentence that mentions the word is used as context.
        context_sentence = next(
            (
                sentence
                for sentence in self.enrichment_text.split(".")
                if word in sentence or word in sentence.lower()
            ),
            None,
        )
        if context_sentence is None:
            return []

        formatted_text = f"[CLS] {context_sentence.lower()}. [SEP] "
        tokens = self.bert_tokenizer.tokenize(formatted_text)

        # Mask every token containing the word; the prediction is read at the
        # last masked position. The [CLS]/[SEP] markers added above are never
        # masked: short words such as "s" or "sep" are substrings of "[sep]",
        # and masking the separator would break the segment computation below.
        structural_tokens = ("[CLS]", "[SEP]")
        masked_index = None
        for position, token in enumerate(tokens):
            if token in structural_tokens:
                continue
            if word in token.lower():
                tokens[position] = "[MASK]"
                masked_index = position

        if masked_index is None:
            return []

        # Convert tokens to vocabulary indices.
        token_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)

        # Segment 0 covers everything up to and including the first [SEP],
        # segment 1 covers whatever follows it.
        first_segment_length = tokens.index("[SEP]") + 1
        segment_ids = [0] * first_segment_length + [1] * (
            len(tokens) - first_segment_length
        )

        # Convert the inputs to PyTorch tensors (batch of one sequence).
        tokens_tensor = torch.tensor([token_ids])
        segments_tensor = torch.tensor([segment_ids])

        # Predict all tokens without tracking gradients.
        with torch.no_grad():
            outputs = self.bert_model(tokens_tensor, token_type_ids=segments_tensor)
            predictions = outputs[0]

        # Vocabulary ids of the top thirty candidates for the masked position.
        top_token_ids = torch.topk(predictions[0, masked_index], 30)[1].tolist()

        # NOTE: an earlier revision tried to drop token ids 1528 and 1000 from
        # the predictions at this point. That code was disabled (the check for
        # the string "1528" could never match integer ids, and the reason for
        # excluding id 1000 was unclear), so no id-based filtering is applied.

        predicted_tokens: list[str] = self.bert_tokenizer.convert_ids_to_tokens(
            top_token_ids
        )

        return [
            token for token in predicted_tokens if not check_is_bert_oov_word(token)
        ]

__call__(word)

Generate similar words using BERT.

Parameters:

Name Type Description Default
word str

Word from which to find similar words.

required

Returns:

Type Description
list[str]

List of similar words.

Source code in src/sesg/similar_words/bert_strategy.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def __call__(self, word: str) -> list[str]:
    """Generate similar words using BERT.

    The first sentence of `self.enrichment_text` that contains `word` is used
    as context: the matching token(s) are replaced by `[MASK]` and the model's
    predictions for the last masked position are returned.

    Args:
        word (str): Word from which to find similar words.

    Returns:
        List of similar words: the top 30 predictions for the masked
        position, minus the ones rejected by `check_is_bert_oov_word`.
        The list is empty when `word` is empty, contains a space, does not
        occur in the enrichment text, or does not show up in any token of
        the selected sentence.
    """
    # An empty string is a substring of every token, and words containing
    # a space are rejected outright.
    if not word or " " in word:
        return []

    # Only the first sentence that mentions the word is used as context.
    context_sentence = next(
        (
            sentence
            for sentence in self.enrichment_text.split(".")
            if word in sentence or word in sentence.lower()
        ),
        None,
    )
    if context_sentence is None:
        return []

    formatted_text = f"[CLS] {context_sentence.lower()}. [SEP] "
    tokens = self.bert_tokenizer.tokenize(formatted_text)

    # Mask every token containing the word; the prediction is read at the
    # last masked position. The [CLS]/[SEP] markers added above are never
    # masked: short words such as "s" or "sep" are substrings of "[sep]",
    # and masking the separator would break the segment computation below.
    structural_tokens = ("[CLS]", "[SEP]")
    masked_index = None
    for position, token in enumerate(tokens):
        if token in structural_tokens:
            continue
        if word in token.lower():
            tokens[position] = "[MASK]"
            masked_index = position

    if masked_index is None:
        return []

    # Convert tokens to vocabulary indices.
    token_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)

    # Segment 0 covers everything up to and including the first [SEP],
    # segment 1 covers whatever follows it.
    first_segment_length = tokens.index("[SEP]") + 1
    segment_ids = [0] * first_segment_length + [1] * (
        len(tokens) - first_segment_length
    )

    # Convert the inputs to PyTorch tensors (batch of one sequence).
    tokens_tensor = torch.tensor([token_ids])
    segments_tensor = torch.tensor([segment_ids])

    # Predict all tokens without tracking gradients.
    with torch.no_grad():
        outputs = self.bert_model(tokens_tensor, token_type_ids=segments_tensor)
        predictions = outputs[0]

    # Vocabulary ids of the top thirty candidates for the masked position.
    top_token_ids = torch.topk(predictions[0, masked_index], 30)[1].tolist()

    # NOTE: an earlier revision tried to drop token ids 1528 and 1000 from
    # the predictions at this point. That code was disabled (the check for
    # the string "1528" could never match integer ids, and the reason for
    # excluding id 1000 was unclear), so no id-based filtering is applied.

    predicted_tokens: list[str] = self.bert_tokenizer.convert_ids_to_tokens(
        top_token_ids
    )

    return [
        token for token in predicted_tokens if not check_is_bert_oov_word(token)
    ]

create_enrichment_text(studies_list) staticmethod

Creates a piece of text that consists of the concatenation of the title and abstract of each study.

Parameters:

Name Type Description Default
studies_list list[EnrichmentStudy]

List of studies with title and abstract.

required

Returns:

Type Description
str

The enrichment text.

Examples:

>>> studies = [
...     EnrichmentStudy(title="title1", abstract="abstract1"),
...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
...     EnrichmentStudy(title="title3", abstract="abstract3"),
... ]
>>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
Source code in src/sesg/similar_words/bert_strategy.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
@staticmethod
def create_enrichment_text(
    studies_list: list[EnrichmentStudy],
) -> str:
    r"""Creates a piece of text that consists of the concatenation of the title and abstract of each study.

    Each study contributes one line of the form `"<title> <abstract>"`,
    stripped of surrounding whitespace, with every `"\r\n"` replaced by
    `"#."` and terminated by `"\n"`.

    Args:
        studies_list (list[EnrichmentStudy]): List of studies with title and abstract.

    Returns:
        The enrichment text. An empty list yields an empty string.

    Examples:
        >>> studies = [
        ...     EnrichmentStudy(title="title1", abstract="abstract1"),
        ...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
        ...     EnrichmentStudy(title="title3", abstract="abstract3"),
        ... ]
        >>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
        'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
    """  # noqa: E501
    # Build the text with a single join instead of repeated `+=`.
    return "".join(
        f"{study['title']} {study['abstract']}".strip().replace("\r\n", "#.") + "\n"
        for study in studies_list
    )

SimilarWordsGenerator

Bases: Protocol

Protocol for a similar words generator.

Source code in src/sesg/similar_words/protocol.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
class SimilarWordsGenerator(Protocol):
    """Protocol for a similar words generator.

    Implementations are callables that receive a word and return the words
    considered similar to it.
    """

    def __call__(self, word: str) -> list[str]:  # pragma: no cover
        """Interface of a function that generates similar words.

        Args:
            word (str): Word from which to find similar words.

        Returns:
            List of similar words.

        Raises:
            NotImplementedError: Always, since this is only the interface.
        """
        raise NotImplementedError

__call__(word)

Interface of a function that generates similar words.

Parameters:

Name Type Description Default
word str

Word from which to find similar words.

required

Returns:

Type Description
list[str]

List of similar words.

Source code in src/sesg/similar_words/protocol.py
 9
10
11
12
13
14
15
16
17
18
def __call__(self, word: str) -> list[str]:  # pragma: no cover
    """Interface of a function that generates similar words.

    Args:
        word (str): Word from which to find similar words.

    Returns:
        List of similar words.

    Raises:
        NotImplementedError: Always, since this is only the interface.
    """
    raise NotImplementedError

filter_with_stemming(word, *, similar_words_list)

Filters out similar words that are not relevant.

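A similar word is kept on the list if it complies with the following criteria:

- It is not a punctuation character (see check_word_is_punctuation).
- Its stemmed form is valid (see check_stemmed_similar_word_is_valid).
- Its stemmed form is not a duplicate (see check_stemmed_similar_word_is_duplicate).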

Parameters:

Name Type Description Default
word str

Word that was used to generate the similar ones.

required
similar_words_list list[str]

List with the similar words.

required

Returns:

Type Description
list[str]

List of filtered similar words.

Source code in src/sesg/similar_words/stemming_filter.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
def filter_with_stemming(
    word: str,
    *,
    similar_words_list: list[str],
) -> list[str]:
    """Filters out similar words that are not relevant.

    Candidates are evaluated in the order given. Each one is stemmed and then
    judged by `check_similar_word_is_relevant` against the stem of `word` and
    the stems of the candidates accepted so far.

    A similar word is kept on the list if it complies with the following criteria:

    - It is not a punctuation character (see [check_word_is_punctuation][sesg.similar_words.stemming_filter.check_word_is_punctuation]).
    - Its stemmed form is valid (see [check_stemmed_similar_word_is_valid][sesg.similar_words.stemming_filter.check_stemmed_similar_word_is_valid]).
    - Its stemmed form is not a duplicate (see [check_stemmed_similar_word_is_duplicate][sesg.similar_words.stemming_filter.check_stemmed_similar_word_is_duplicate]).

    Args:
        word (str): Word that was used to generate the similar ones.
        similar_words_list (list[str]): List with the similar words.

    Returns:
        List of filtered similar words.
    """  # noqa: E501
    target_stem: str = lancaster.stem(word)

    # Accepted candidates, and their stems kept in the same order.
    kept_words: list[str] = []
    kept_stems: list[str] = []

    for candidate in similar_words_list:
        candidate_stem = lancaster.stem(candidate)

        is_relevant = check_similar_word_is_relevant(
            candidate,
            stemmed_word=target_stem,
            stemmed_similar_word=candidate_stem,
            stemmed_relevant_similar_words=kept_stems,
        )

        if is_relevant:
            kept_words.append(candidate)
            kept_stems.append(candidate_stem)

    return kept_words