similar_words

Similar words module.

Provides strategies for extracting similar words from a given word and filtering them.

`BertSimilarWordsGenerator` `dataclass`

Bases: SimilarWordsGenerator

Generate similar words using BERT.

Attributes:

Name	Type	Description
`enrichment_text`	`str`	Text that will be used to find similar words.
`bert_tokenizer`	`Any`	A BERT tokenizer. For example, `BertTokenizer.from_pretrained("bert-base-uncased")`.
`bert_model`	`Any`	A BERT model. For example, `BertForMaskedLM.from_pretrained("bert-base-uncased")`.

Source code in src/sesg/similar_words/bert_strategy.py

@dataclass
class BertSimilarWordsGenerator(SimilarWordsGenerator):
    """Generate similar words using BERT.

    Attributes:
        enrichment_text (str): Text that will be used to find similar words.
        bert_tokenizer (Any): A BERT tokenizer. For example, `BertTokenizer.from_pretrained("bert-base-uncased")`.
        bert_model (Any): A BERT model. For example, `BertForMaskedLM.from_pretrained("bert-base-uncased")`.
    """  # noqa: E501

    enrichment_text: str
    bert_tokenizer: Any
    bert_model: Any

    @staticmethod
    def create_enrichment_text(
        studies_list: list[EnrichmentStudy],
    ) -> str:
        r"""Creates a piece of text that consists of the concatenation of the title and abstract of each study.

        Each study contributes one line of the form `<title> <abstract>`:
        surrounding whitespace is stripped, every `\r\n` is replaced by `#.`
        and a trailing `\n` is appended.

        Args:
            studies_list (list[EnrichmentStudy]): List of studies with title and abstract.

        Returns:
            The enrichment text. An empty list yields an empty string.

        Examples:
            >>> studies = [
            ...     EnrichmentStudy(title="title1", abstract="abstract1"),
            ...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
            ...     EnrichmentStudy(title="title3", abstract="abstract3"),
            ... ]
            >>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
            'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
        """  # noqa: E501
        # A single join avoids repeatedly re-allocating a growing string.
        return "".join(
            f"{study['title']} {study['abstract']}".strip().replace("\r\n", "#.")
            + "\n"
            for study in studies_list
        )

    def __call__(self, word: str) -> list[str]:
        """Generate similar words using BERT.

        The first sentence of `enrichment_text` (split on `"."`) that contains
        `word` is used as context. Every token of that sentence containing
        `word` is replaced by `[MASK]`, and the top thirty predictions for the
        last masked position are returned, minus the tokens rejected by
        `check_is_bert_oov_word`.

        Args:
            word (str): Word from which to find similar words.

        Returns:
            List of similar words. The list is empty when `word` is empty,
            contains a space, does not occur in the enrichment text, or does
            not occur in any token of the selected sentence.
        """
        # An empty word would match (and mask) every token; multi-word inputs
        # are not supported.
        if not word or " " in word:
            return []

        # Only the first sentence mentioning the word is used as context.
        context = next(
            (
                sentence
                for sentence in self.enrichment_text.split(".")
                if word in sentence or word in sentence.lower()
            ),
            None,
        )
        if context is None:
            return []

        formatted_text = "[CLS] " + context.lower() + "." + " [SEP] "
        tokenized_text = self.bert_tokenizer.tokenize(formatted_text)

        # Mask every token that contains the word; predictions are read at the
        # last masked position. The special tokens are never masked: masking
        # "[SEP]" (e.g. for the word "sep" or "s") used to make the
        # `index("[SEP]")` lookup below raise ValueError.
        special_tokens = ("[CLS]", "[SEP]")
        masked_index = None
        for position, token in enumerate(tokenized_text):
            if token in special_tokens:
                continue
            if word in token.lower():
                tokenized_text[position] = "[MASK]"
                masked_index = position

        if masked_index is None:
            return []

        # Convert tokens to vocabulary indices.
        indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(tokenized_text)

        # Segment ids: 0 up to and including the first "[SEP]", 1 afterwards.
        first_segment_length = tokenized_text.index("[SEP]") + 1
        remaining_length = len(tokenized_text) - first_segment_length
        segments_ids = [0] * first_segment_length + [1] * remaining_length

        # Convert the inputs to PyTorch tensors (batch of one).
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # Predict all tokens without tracking gradients.
        with torch.no_grad():
            outputs = self.bert_model(tokens_tensor, token_type_ids=segments_tensors)
            # presumably the per-token vocabulary scores — confirm against the
            # model class in use.
            predictions = outputs[0]

        # Get top thirty possibilities for the masked word, as plain ints.
        predicted_ids = torch.topk(predictions[0, masked_index], 30)[1].tolist()

        # NOTE(review): an earlier revision tried to drop token ids 1528 and
        # 1000 at this point. The reason was never documented (and the 1528
        # check compared ints against a string), so no such filter is applied.

        predicted_tokens: list[str] = self.bert_tokenizer.convert_ids_to_tokens(
            predicted_ids
        )

        return [
            token for token in predicted_tokens if not check_is_bert_oov_word(token)
        ]

`__call__(word)`

Generate similar words using BERT.

Parameters:

Name	Type	Description	Default
`word`	`str`	Word from which to find similar words.	required

Returns:

Type	Description
`list[str]`	List of similar words.

Source code in src/sesg/similar_words/bert_strategy.py

def __call__(self, word: str) -> list[str]:
    """Generate similar words using BERT.

    The first sentence of `enrichment_text` (split on `"."`) that contains
    `word` is used as context. Every token of that sentence containing
    `word` is replaced by `[MASK]`, and the top thirty predictions for the
    last masked position are returned, minus the tokens rejected by
    `check_is_bert_oov_word`.

    Args:
        word (str): Word from which to find similar words.

    Returns:
        List of similar words. The list is empty when `word` is empty,
        contains a space, does not occur in the enrichment text, or does
        not occur in any token of the selected sentence.
    """
    # An empty word would match (and mask) every token; multi-word inputs
    # are not supported.
    if not word or " " in word:
        return []

    # Only the first sentence mentioning the word is used as context.
    context = next(
        (
            sentence
            for sentence in self.enrichment_text.split(".")
            if word in sentence or word in sentence.lower()
        ),
        None,
    )
    if context is None:
        return []

    formatted_text = "[CLS] " + context.lower() + "." + " [SEP] "
    tokenized_text = self.bert_tokenizer.tokenize(formatted_text)

    # Mask every token that contains the word; predictions are read at the
    # last masked position. The special tokens are never masked: masking
    # "[SEP]" (e.g. for the word "sep" or "s") used to make the
    # `index("[SEP]")` lookup below raise ValueError.
    special_tokens = ("[CLS]", "[SEP]")
    masked_index = None
    for position, token in enumerate(tokenized_text):
        if token in special_tokens:
            continue
        if word in token.lower():
            tokenized_text[position] = "[MASK]"
            masked_index = position

    if masked_index is None:
        return []

    # Convert tokens to vocabulary indices.
    indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(tokenized_text)

    # Segment ids: 0 up to and including the first "[SEP]", 1 afterwards.
    first_segment_length = tokenized_text.index("[SEP]") + 1
    remaining_length = len(tokenized_text) - first_segment_length
    segments_ids = [0] * first_segment_length + [1] * remaining_length

    # Convert the inputs to PyTorch tensors (batch of one).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Predict all tokens without tracking gradients.
    with torch.no_grad():
        outputs = self.bert_model(tokens_tensor, token_type_ids=segments_tensors)
        # presumably the per-token vocabulary scores — confirm against the
        # model class in use.
        predictions = outputs[0]

    # Get top thirty possibilities for the masked word, as plain ints.
    predicted_ids = torch.topk(predictions[0, masked_index], 30)[1].tolist()

    # NOTE(review): an earlier revision tried to drop token ids 1528 and
    # 1000 at this point. The reason was never documented (and the 1528
    # check compared ints against a string), so no such filter is applied.

    predicted_tokens: list[str] = self.bert_tokenizer.convert_ids_to_tokens(
        predicted_ids
    )

    return [
        token for token in predicted_tokens if not check_is_bert_oov_word(token)
    ]

`create_enrichment_text(studies_list)` `staticmethod`

Creates a piece of text that consists of the concatenation of the title and abstract of each study.

Parameters:

Name	Type	Description	Default
`studies_list`	`list[EnrichmentStudy]`	List of studies with title and abstract.	required

Returns:

Type	Description
`str`	The enrichment text.

Examples:

>>> studies = [
...     EnrichmentStudy(title="title1", abstract="abstract1"),
...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
...     EnrichmentStudy(title="title3", abstract="abstract3"),
... ]
>>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'

Source code in src/sesg/similar_words/bert_strategy.py

@staticmethod
def create_enrichment_text(
    studies_list: list[EnrichmentStudy],
) -> str:
    r"""Creates a piece of text that consists of the concatenation of the title and abstract of each study.

    Args:
        studies_list (list[EnrichmentStudy]): List of studies with title and abstract.

    Returns:
        The enrichment text.

    Examples:
        >>> studies = [
        ...     EnrichmentStudy(title="title1", abstract="abstract1"),
        ...     EnrichmentStudy(title="title2", abstract="abstract2 \r\ntext"),
        ...     EnrichmentStudy(title="title3", abstract="abstract3"),
        ... ]
        >>> BertSimilarWordsGenerator.create_enrichment_text(studies_list=studies)
        'title1 abstract1\ntitle2 abstract2 #.text\ntitle3 abstract3\n'
    """  # noqa: E501
    enrichment_text = ""
    for study in studies_list:
        title = study["title"]
        abstract = study["abstract"]

        line = f"{title} {abstract}".strip().replace("\r\n", "#.") + "\n"
        enrichment_text += line

    return enrichment_text

`SimilarWordsGenerator`

Bases: Protocol

Protocol for a similar words generator.

Source code in src/sesg/similar_words/protocol.py

class SimilarWordsGenerator(Protocol):
    """Protocol for a similar words generator.

    Describes a callable that receives a single word and returns a list
    of words similar to it.
    """

    def __call__(self, word: str) -> list[str]:  # pragma: no cover
        """Interface of a function that generates similar words.

        Args:
            word (str): Word from which to find similar words.

        Returns:
            List of similar words.

        Raises:
            NotImplementedError: Always, since this body only declares the
                interface.
        """
        raise NotImplementedError

`__call__(word)`

Interface of a function that generates similar words.

Parameters:

Name	Type	Description	Default
`word`	`str`	Word from which to find similar words.	required

Returns:

Type	Description
`list[str]`	List of similar words.

Source code in src/sesg/similar_words/protocol.py

def __call__(self, word: str) -> list[str]:  # pragma: no cover
    """Interface of a function that generates similar words.

    Args:
        word (str): Word from which to find similar words.

    Returns:
        List of similar words.

    Raises:
        NotImplementedError: Always, since this body only declares the
            interface.
    """
    raise NotImplementedError

`filter_with_stemming(word, *, similar_words_list)`

Filters out similar words that are not relevant.

A similar word is kept on the list if it complies with the following criteria:

- It is not a punctuation character (see check_word_is_punctuation).
- Its stemmed form is valid (see check_stemmed_similar_word_is_valid).
- Its stemmed form is not a duplicate (see check_stemmed_similar_word_is_duplicate).

Parameters:

Name	Type	Description	Default
`word`	`str`	Word that was used to generate the similar ones.	required
`similar_words_list`	`list[str]`	List with the similar words.	required

Returns:

Type	Description
`list[str]`	List of filtered similar words.

Source code in src/sesg/similar_words/stemming_filter.py

def filter_with_stemming(
    word: str,
    *,
    similar_words_list: list[str],
) -> list[str]:
    """Filters out similar words that are not relevant.

    Relevance is decided by `check_similar_word_is_relevant`, which receives
    the candidate, its stem, the stem of `word` and the stems of the
    candidates accepted so far. As documented for this module, a similar word
    is kept if it complies with the following criteria:

    - It is not a punctuation character (see [check_word_is_punctuation][sesg.similar_words.stemming_filter.check_word_is_punctuation]).
    - Its stemmed form is valid (see [check_stemmed_similar_word_is_valid][sesg.similar_words.stemming_filter.check_stemmed_similar_word_is_valid]).
    - Its stemmed form is not a duplicate (see [check_stemmed_similar_word_is_duplicate][sesg.similar_words.stemming_filter.check_stemmed_similar_word_is_duplicate]).

    Args:
        word (str): Word that was used to generate the similar ones.
        similar_words_list (list[str]): List with the similar words.

    Returns:
        List of filtered similar words, in their original order.
    """  # noqa: E501
    base_stem: str = lancaster.stem(word)

    # Accepted candidates and, in parallel, their stems. The stem list is
    # handed to the relevance check on every iteration, so it must grow in
    # step with the accepted words.
    kept_words: list[str] = []
    kept_stems: list[str] = []

    for candidate in similar_words_list:
        candidate_stem = lancaster.stem(candidate)

        if check_similar_word_is_relevant(
            candidate,
            stemmed_word=base_stem,
            stemmed_similar_word=candidate_stem,
            stemmed_relevant_similar_words=kept_stems,
        ):
            kept_words.append(candidate)
            kept_stems.append(candidate_stem)

    return kept_words

similar_words

BertSimilarWordsGenerator dataclass

__call__(word)

create_enrichment_text(studies_list) staticmethod

SimilarWordsGenerator

__call__(word)

filter_with_stemming(word, *, similar_words_list)

`BertSimilarWordsGenerator` `dataclass`

`__call__(word)`

`create_enrichment_text(studies_list)` `staticmethod`

`SimilarWordsGenerator`

`__call__(word)`

`filter_with_stemming(word, *, similar_words_list)`