Skip to content

bert_strategy

Generate similar words using BERT.

BertSimilarWordsGenerator dataclass

Bases: SimilarWordsGenerator

Generate similar words using BERT.

Attributes:

Name Type Description
enrichment_text str

Text that will be used to find similar words.

bert_tokenizer Any

A BERT tokenizer. For example, BertTokenizer.from_pretrained("bert-base-uncased").

bert_model Any

A BERT model. For example, BertForMaskedLM.from_pretrained("bert-base-uncased").

Source code in src/sesg/similar_words/bert_strategy.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
@dataclass
class BertSimilarWordsGenerator(SimilarWordsGenerator):
    """Generate similar words using BERT.

    The generator looks up the first sentence of `enrichment_text` that
    mentions the requested word, masks the word inside that sentence and
    asks a masked-language model for the most likely replacements.

    Attributes:
        enrichment_text (str): Text that will be used to find similar words.
        bert_tokenizer (Any): A BERT tokenizer. For example, `BertTokenizer.from_pretrained("bert-base-uncased")`.
        bert_model (Any): A BERT model. For example, `BertForMaskedLM.from_pretrained("bert-base-uncased")`.
    """  # noqa: E501

    enrichment_text: str
    bert_tokenizer: Any
    bert_model: Any

    @staticmethod
    def create_enrichment_text(
        studies_list: list[EnrichmentStudy],
    ) -> str:
        r"""Creates a piece of text that consists of the concatenation of the title and abstract of each study.

        Each study contributes exactly one line: `"<title> <abstract>"`, stripped
        of surrounding whitespace, with every `\r\n` replaced by `#.` and a
        trailing `\n` appended.

        Args:
            studies_list (list[EnrichmentStudy]): List of studies with title and abstract.

        Returns:
            The enrichment text (an empty string when `studies_list` is empty).

        Examples:
            >>> studies = [
            ...     EnrichmentStudy(title="title1", abstract="abstract1"),
            ...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
            ...     EnrichmentStudy(title="title3", abstract="abstract3"),
            ... ]
            >>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
            'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
        """  # noqa: E501
        # Join once instead of growing a string with `+=` inside the loop.
        return "".join(
            f"{study['title']} {study['abstract']}".strip().replace("\r\n", "#.")
            + "\n"
            for study in studies_list
        )

    def __call__(self, word: str) -> list[str]:
        """Generate similar words using BERT.

        Args:
            word (str): Word from which to find similar words. Must be a
                single, non-empty word (no spaces).

        Returns:
            List of similar words. The list is empty when `word` is empty or
            contains a space, when no sentence of the enrichment text mentions
            it, or when none of the tokens of the selected sentence contains it.
        """
        if not word or " " in word:
            return []

        # Only the first sentence (split on ".") that mentions the word is
        # used as context for the prediction.
        selected_sentence = next(
            (
                sentence + "."
                for sentence in self.enrichment_text.split(".")
                if word in sentence or word in sentence.lower()
            ),
            None,
        )

        # Nothing to mask: skip the tokenizer/model round trip altogether.
        if selected_sentence is None:
            return []

        formated_sentences = "[CLS] " + selected_sentence.lower() + " [SEP] "
        tokenized_text = self.bert_tokenizer.tokenize(formated_sentences)

        # Mask every token that contains the word. The special tokens are
        # skipped on purpose: a substring match such as "sep" in "[sep]" would
        # otherwise replace "[SEP]" and break the segment computation below.
        # When several tokens match, predictions are read at the last one.
        special_tokens = {"[CLS]", "[SEP]"}
        masked_index = None

        for position, token in enumerate(tokenized_text):
            if token in special_tokens:
                continue

            if word in token.lower():
                tokenized_text[position] = "[MASK]"
                masked_index = position

        if masked_index is None:
            return []

        # Convert token to vocabulary indices.
        indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(tokenized_text)

        # Define sentence A and B indices associated to first and second sentences.
        len_first = tokenized_text.index("[SEP]") + 1
        segments_ids = [0] * len_first + [1] * (len(tokenized_text) - len_first)

        # Convert the inputs to PyTorch tensors.
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # Predict all tokens.
        with torch.no_grad():
            outputs = self.bert_model(tokens_tensor, token_type_ids=segments_tensors)
            predictions = outputs[0]

        # Get top thirty possibilities for the masked word, as plain ints.
        predicted_index = torch.topk(predictions[0, masked_index], 30)[1].tolist()

        predicted_tokens: list[str] = self.bert_tokenizer.convert_ids_to_tokens(
            predicted_index
        )

        return [
            token for token in predicted_tokens if not check_is_bert_oov_word(token)
        ]

__call__(word)

Generate similar words using BERT.

Parameters:

Name Type Description Default
word str

Word from which to find similar words.

required

Returns:

Type Description
list[str]

List of similar words.

Source code in src/sesg/similar_words/bert_strategy.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def __call__(self, word: str) -> list[str]:
    """Generate similar words using BERT.

    Args:
        word (str): Word from which to find similar words. Must be a
            single, non-empty word (no spaces).

    Returns:
        List of similar words. The list is empty when `word` is empty or
        contains a space, when no sentence of the enrichment text mentions
        it, or when none of the tokens of the selected sentence contains it.
    """
    if not word or " " in word:
        return []

    # Only the first sentence (split on ".") that mentions the word is
    # used as context for the prediction.
    selected_sentence = next(
        (
            sentence + "."
            for sentence in self.enrichment_text.split(".")
            if word in sentence or word in sentence.lower()
        ),
        None,
    )

    # Nothing to mask: skip the tokenizer/model round trip altogether.
    if selected_sentence is None:
        return []

    formated_sentences = "[CLS] " + selected_sentence.lower() + " [SEP] "
    tokenized_text = self.bert_tokenizer.tokenize(formated_sentences)

    # Mask every token that contains the word. The special tokens are
    # skipped on purpose: a substring match such as "sep" in "[sep]" would
    # otherwise replace "[SEP]" and break the segment computation below.
    # When several tokens match, predictions are read at the last one.
    special_tokens = {"[CLS]", "[SEP]"}
    masked_index = None

    for position, token in enumerate(tokenized_text):
        if token in special_tokens:
            continue

        if word in token.lower():
            tokenized_text[position] = "[MASK]"
            masked_index = position

    if masked_index is None:
        return []

    # Convert token to vocabulary indices.
    indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(tokenized_text)

    # Define sentence A and B indices associated to first and second sentences.
    len_first = tokenized_text.index("[SEP]") + 1
    segments_ids = [0] * len_first + [1] * (len(tokenized_text) - len_first)

    # Convert the inputs to PyTorch tensors.
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Predict all tokens.
    with torch.no_grad():
        outputs = self.bert_model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]

    # Get top thirty possibilities for the masked word, as plain ints.
    predicted_index = torch.topk(predictions[0, masked_index], 30)[1].tolist()

    predicted_tokens: list[str] = self.bert_tokenizer.convert_ids_to_tokens(
        predicted_index
    )

    return [
        token for token in predicted_tokens if not check_is_bert_oov_word(token)
    ]

create_enrichment_text(studies_list) staticmethod

Creates a piece of text that consists of the concatenation of the title and abstract of each study.

Parameters:

Name Type Description Default
studies_list list[EnrichmentStudy]

List of studies with title and abstract.

required

Returns:

Type Description
str

The enrichment text.

Examples:

>>> studies = [
...     EnrichmentStudy(title="title1", abstract="abstract1"),
...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
...     EnrichmentStudy(title="title3", abstract="abstract3"),
... ]
>>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
Source code in src/sesg/similar_words/bert_strategy.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
@staticmethod
def create_enrichment_text(
    studies_list: list[EnrichmentStudy],
) -> str:
    r"""Creates a piece of text that consists of the concatenation of the title and abstract of each study.

    Args:
        studies_list (list[EnrichmentStudy]): List of studies with title and abstract.

    Returns:
        The enrichment text.

    Examples:
        >>> studies = [
        ...     EnrichmentStudy(title="title1", abstract="abstract1"),
        ...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
        ...     EnrichmentStudy(title="title3", abstract="abstract3"),
        ... ]
        >>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
        'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
    """  # noqa: E501
    enrichment_text = ""
    for study in studies_list:
        title = study["title"]
        abstract = study["abstract"]

        line = f"{title} {abstract}".strip().replace("\r\n", "#.") + "\n"
        enrichment_text += line

    return enrichment_text

EnrichmentStudy

Bases: TypedDict

Data container for a study that will be used to generate an enrichment text.

Attributes:

Name Type Description
title str

Title of the study.

abstract str

Abstract of the study.

Examples:

>>> study: EnrichmentStudy = {
...     "title": "machine learning",
...     "abstract": "machine learning is often used in the industry with the goal of...",
... }
>>> study
{'title': 'machine learning', 'abstract': 'machine learning is often used in the industry with the goal of...'}
Source code in src/sesg/similar_words/bert_strategy.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class EnrichmentStudy(TypedDict):
    """Typed dictionary describing one study used to build an enrichment text.

    Both keys are required and hold plain strings.

    Attributes:
        title (str): Title of the study.
        abstract (str): Abstract of the study.

    Examples:
        >>> study: EnrichmentStudy = {
        ...     "title": "machine learning",
        ...     "abstract": "machine learning is often used in the industry with the goal of...",
        ... }
        >>> study
        {'title': 'machine learning', 'abstract': 'machine learning is often used in the industry with the goal of...'}
    """  # noqa: E501

    title: str
    abstract: str

check_is_bert_oov_word(word)

Checks if the given word is a BERT out-of-vocabulary (OOV) word.

BERT's tokenizer breaks OOV words into sub-word pieces, and the resulting continuation pieces are strings that start with ##.

Parameters:

Name Type Description Default
word str

Word to check.

required

Returns:

Type Description
bool

True if it is an OOV word, False otherwise.

Examples:

>>> check_is_bert_oov_word("organization")
False
>>> check_is_bert_oov_word("##ation")
True
Source code in src/sesg/similar_words/bert_strategy.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def check_is_bert_oov_word(
    word: str,
) -> bool:
    """Tells whether a token is a BERT out-of-vocabulary (OOV) word.

    The check is purely lexical: a token is treated as OOV exactly when it
    starts with the `##` marker.

    Args:
        word (str): Word to check.

    Returns:
        True if it is an OOV word, False otherwise.

    Examples:
        >>> check_is_bert_oov_word("organization")
        False
        >>> check_is_bert_oov_word("##ation")
        True
    """
    oov_marker = "##"
    return word.startswith(oov_marker)