Skip to content

fuzzy_bsb

Fuzzy backward snowballing module.

Performs backward snowballing using fuzzy matching with rapidfuzz to perform string similarity checks.

FuzzyBackwardSnowballingStudy

Represents a study that will be included in backward snowballing.

The constructor will preprocess the title and text content to the correct format.

Examples:

>>> s = FuzzyBackwardSnowballingStudy(id=1, title=" title. HERE ", text_content=" text. \n \r\n HERE ")
>>> s.title == "titlehere", s.text_content == "texthere"
(True, True)
Source code in src/sesg/snowballing/fuzzy_bsb.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class FuzzyBackwardSnowballingStudy:
    r"""A study prepared to take part in fuzzy backward snowballing.

    On construction the title is normalized with `preprocess_title` and the
    text content with `preprocess_text`. The stored values are exposed through
    read-only properties.

    Examples:
        >>> s = FuzzyBackwardSnowballingStudy(id=1, title=" title. HERE ", text_content=" text. \n \r\n HERE ")
        >>> s.title == "titlehere", s.text_content == "texthere"
        (True, True)
    """  # noqa: E501

    __id: int
    __title: str
    __text_content: str

    def __init__(
        self,
        *,
        id: int,
        title: str,
        text_content: str,
    ) -> None:
        """Builds a FuzzyBackwardSnowballingStudy from raw title and text.

        Args:
            id (int): Identifier of the study. Could be a database id, for example.
            title (str): Raw title of the study; stored after `preprocess_title`.
            text_content (str): Raw content of the study (could be extracted from a PDF with CERMINE); stored after `preprocess_text`.
        """  # noqa: E501
        self.__id = id

        normalized_title = preprocess_title(title)
        self.__title = normalized_title

        normalized_text = preprocess_text(text_content)
        self.__text_content = normalized_text

    @property
    def id(self) -> int:
        """Identifier given to the study at construction."""
        return self.__id

    @property
    def title(self) -> str:
        """Preprocessed title of the study."""
        return self.__title

    @property
    def text_content(self) -> str:
        """Preprocessed text content of the study."""
        return self.__text_content
id: int property

ID of the study.

text_content: str property

Text content of the study.

title: str property

Title of the study.

__init__(*, id, title, text_content)

Creates an instance of a FuzzyBackwardSnowballingStudy.

Parameters:

Name Type Description Default
id int

Identifier of the study. Could be a database id, for example.

required
title str

Title of the study.

required
text_content str

Content of the study. Could be extracted from a PDF with CERMINE.

required
Source code in src/sesg/snowballing/fuzzy_bsb.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
def __init__(
    self,
    *,
    id: int,
    title: str,
    text_content: str,
) -> None:
    """Builds a FuzzyBackwardSnowballingStudy from raw title and text.

    Args:
        id (int): Identifier of the study. Could be a database id, for example.
        title (str): Raw title of the study; stored after `preprocess_title`.
        text_content (str): Raw content of the study (could be extracted from a PDF with CERMINE); stored after `preprocess_text`.
    """  # noqa: E501
    self.__id = id

    normalized_title = preprocess_title(title)
    self.__title = normalized_title

    normalized_text = preprocess_text(text_content)
    self.__text_content = normalized_text

PooledTitleIsInTextArgs

Bases: TypedDict

Data container for the arguments of the pooled_check_title_is_in_text function.

Attributes:

Name Type Description
title str

Title to search for

text str

Text of the study.

skip bool

Indicates if should skip the execution and return False.

Source code in src/sesg/snowballing/fuzzy_bsb.py
84
85
86
87
88
89
90
91
92
93
94
95
class PooledTitleIsInTextArgs(TypedDict):
    """Data container for the arguments of the [`pooled_check_title_is_in_text`][sesg.snowballing.fuzzy_bsb.pooled_check_title_is_in_text] function.

    All values needed by one check are bundled into a single dict so that they
    can be handed over as one argument.

    Attributes:
        title (str): Title to search for.
        text (str): Text of the study in which the title is searched.
        skip (bool): Indicates if should skip the execution and return False.
    """  # noqa: E501

    title: str
    text: str
    skip: bool

check_title_is_in_text(*, title, text)

Uses thefuzz.process.extractOne to determine if a title is in a piece of text.

Parameters:

Name Type Description Default
title str

Title to search for.

required
text str

Text of the study.

required

Returns:

Type Description
bool

True if the title is in the text, False otherwise.

Examples:

>>> check_title_is_in_text(
...     text="long text here very long REFERENCES: regression tests for machine learning models: a systematic literature review",
...     title="regression tests for machine learning models: a systematic literature review",
... )
True
Source code in src/sesg/snowballing/fuzzy_bsb.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def check_title_is_in_text(
    *,
    title: str,
    text: str,
) -> bool:
    """Tells whether a title appears, approximately, somewhere in a text.

    Every substring of `text` with the same length as `title` is taken as a
    candidate. `process.extractOne` picks the best candidate for the title,
    and the title is considered present when that candidate scores 90 or more.

    Args:
        title (str): Title to search for.
        text (str): Text of the study.

    Returns:
        True if the title is in the text, False otherwise.

    Examples:
        >>> check_title_is_in_text(
        ...     text="long text here very long REFERENCES: regression tests for machine learning models: a systematic literature review",
        ...     title="regression tests for machine learning models: a systematic literature review",
        ... )
        True
    """  # noqa: E501
    # One candidate per sliding window of `len(title)` characters.
    candidates = ["".join(chars) for chars in window(text, size=len(title))]

    best_match = process.extractOne(title, candidates)

    # No match at all (e.g. no candidates) means the title was not found.
    if best_match is None:
        return False

    # The second element of the match is compared against the 90 threshold.
    return bool(best_match[1] >= 90)

fuzzy_backward_snowballing(studies)

Runs backward snowballing in the given list of studies.

Parameters:

Name Type Description Default
studies list[FuzzyBackwardSnowballingStudy]

List of studies with id, title, and text content.

required

Yields:

Type Description
Iterator[tuple[FuzzyBackwardSnowballingStudy, list[FuzzyBackwardSnowballingStudy]]]

A tuple holding a study and its references.

Examples:

>>> studies: list[FuzzyBackwardSnowballingStudy] = [
...     FuzzyBackwardSnowballingStudy(id=1, title="title 1", text_content="... REFERENCES: machine learning, a SLR"),
...     FuzzyBackwardSnowballingStudy(id=2, title="machine learning, a SLR", text_content="... REFERENCES: other studies"),
... ]
>>>
>>> for study, references in fuzzy_backward_snowballing(studies):
...     print((study.id, [r.id for r in references]))
(1, [2])
(2, [])
Source code in src/sesg/snowballing/fuzzy_bsb.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def fuzzy_backward_snowballing(
    studies: list[FuzzyBackwardSnowballingStudy],
) -> Iterator[
    tuple[FuzzyBackwardSnowballingStudy, list[FuzzyBackwardSnowballingStudy]]
]:
    """Runs backward snowballing in the given list of studies.

    For each study, the titles of all the other studies are searched in its
    text content (in parallel, through a `multiprocessing.Pool`). The studies
    whose titles are found are reported as its references.

    A single pool is shared by all the studies. It is created when the first
    item is requested and released when the iteration finishes or the
    generator is closed.

    Args:
        studies (list[FuzzyBackwardSnowballingStudy]): List of studies with id, title, and text content.

    Yields:
        A tuple holding a study and its references.

    Examples:
        >>> studies: list[FuzzyBackwardSnowballingStudy] = [
        ...     FuzzyBackwardSnowballingStudy(id=1, title="title 1", text_content="... REFERENCES: machine learning, a SLR"),
        ...     FuzzyBackwardSnowballingStudy(id=2, title="machine learning, a SLR", text_content="... REFERENCES: other studies"),
        ... ]
        >>>
        >>> for study, references in fuzzy_backward_snowballing(studies):
        ...     print((study.id, [r.id for r in references]))
        (1, [2])
        (2, [])
    """  # noqa: E501
    # Nothing to process: do not start worker processes for an empty input.
    if not studies:
        return

    # The pool is created once and reused for every study; creating it inside
    # the loop would start and terminate the worker processes per study.
    with Pool() as p:
        for study_index, study in enumerate(studies):
            func_args: list[PooledTitleIsInTextArgs] = [
                {
                    # when `study_index == reference_index`, we are checking if study a cites itself,  # noqa: E501
                    # so we skip and set it as False
                    "skip": study_index == reference_index,
                    "text": study.text_content,
                    "title": reference.title,
                }
                for reference_index, reference in enumerate(studies)
            ]

            # if `is_cited_list[j]` is True then the current study cites title `j`
            is_cited_list: list[bool] = p.map(
                pooled_check_title_is_in_text,
                func_args,
            )

            references = [
                ref for ref, is_cited in zip(studies, is_cited_list) if is_cited
            ]

            yield study, references

pooled_check_title_is_in_text(args)

Replicates check_title_is_in_text behaviour, with slight modifications to work well with multiprocessing.Pool.

Parameters:

Name Type Description Default
args PooledTitleIsInTextArgs

args of this function.

required

Returns:

Type Description
bool

False if skip is True, the result of check_title_is_in_text(text=args["text"], title=args["title"]) otherwise.

Examples:

>>> pooled_check_title_is_in_text(
...     {
...         "title": "regression tests for machine learning models: a systematic literature review",
...         "text": "TITLE: regression tests for machine learning models: a systematic literature review. Abstract: abstract here",
...         "skip": True,
...     }
... )
False
Source code in src/sesg/snowballing/fuzzy_bsb.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def pooled_check_title_is_in_text(
    args: PooledTitleIsInTextArgs,
) -> bool:
    """Single-argument variant of [`check_title_is_in_text`][sesg.snowballing.fuzzy_bsb.check_title_is_in_text], meant to be used with `multiprocessing.Pool`.

    Args:
        args (PooledTitleIsInTextArgs): Dict with the `title`, `text`, and `skip` values.

    Returns:
        False if skip is True, the result of `check_title_is_in_text(text=args["text"], title=args["title"])` otherwise.

    Examples:
        >>> pooled_check_title_is_in_text(
        ...     {
        ...         "title": "regression tests for machine learning models: a systematic literature review",
        ...         "text": "TITLE: regression tests for machine learning models: a systematic literature review. Abstract: abstract here",
        ...         "skip": True,
        ...     }
        ... )
        False
    """  # noqa: E501
    # A skipped entry is reported as "not found" without running the check.
    return (
        False
        if args["skip"]
        else check_title_is_in_text(text=args["text"], title=args["title"])
    )

preprocess_text(text)

Processes the study in the following manner.

  1. Removes leading and trailing whitespaces
  2. Turns to lower case
  3. Removes line breaks, carriage returns, spaces, and dots

Parameters:

Name Type Description Default
text str

Study's text to preprocess

required

Returns:

Type Description
str

Preprocessed text.

Examples:

>>> preprocess_text(" text. \n \r\n HERE ")
'texthere'
Source code in src/sesg/snowballing/fuzzy_bsb.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def preprocess_text(
    text: str,
) -> str:
    r"""Normalizes the text of a study.

    The steps are applied in this order:

    1. Leading and trailing whitespaces are stripped
    1. The text is lower-cased
    1. Line breaks, carriage returns, spaces, and dots are removed

    Args:
        text (str): Study's text to preprocess

    Returns:
        Preprocessed text.

    Examples:
        >>> preprocess_text(" text. \n \r\n HERE ")
        'texthere'
    """
    # Deletion table: every listed character is dropped in a single pass.
    removed_chars = str.maketrans("", "", "\n\r .")

    lowered = text.strip().lower()
    return lowered.translate(removed_chars)

preprocess_title(title)

Processes the title in the following manner.

  1. Removes leading and trailing whitespaces
  2. Turns to lower case
  3. Removes spaces and dots

Parameters:

Name Type Description Default
title str

Title to preprocess.

required

Returns:

Type Description
str

Preprocessed title.

Examples:

>>> preprocess_title(" title. HERE ")
'titlehere'
Source code in src/sesg/snowballing/fuzzy_bsb.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def preprocess_title(
    title: str,
) -> str:
    """Normalizes the title of a study.

    The steps are applied in this order:

    1. Leading and trailing whitespaces are stripped
    1. The title is lower-cased
    1. Spaces and dots are removed

    Args:
        title (str): Title to preprocess.

    Returns:
        Preprocessed title.

    Examples:
        >>> preprocess_title(" title. HERE ")
        'titlehere'
    """
    # Deletion table: spaces and dots are dropped in a single pass.
    removed_chars = str.maketrans("", "", " .")

    lowered = title.strip().lower()
    return lowered.translate(removed_chars)

window(seq, *, size)

Creates an iterator over overlapping subslices of the given size.

Parameters:

Name Type Description Default
seq Iterable[T]

Sequence to iterate over.

required
size int

Size of each subslice.

required

Yields:

Type Description
Iterator[tuple[T, ...]]

A subslice with the given size.

Examples:

>>> elements = [1, 2, 3, 4, 5, 6]
>>> for subslice in window(elements, size=3):
...     print(subslice)
(1, 2, 3)
(2, 3, 4)
(3, 4, 5)
(4, 5, 6)
Source code in src/sesg/snowballing/fuzzy_bsb.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def window(
    seq: Iterable[T],
    *,
    size: int,
) -> Iterator[tuple[T, ...]]:
    """Slides a fixed-size window over an iterable, one element at a time.

    Consecutive windows overlap: each one drops the first element of the
    previous window and appends the next element of `seq`. Nothing is yielded
    when `seq` has fewer than `size` elements.

    Args:
        seq (Iterable[T]): Sequence to iterate over.
        size (int): Size of each subslice.

    Yields:
        A subslice with the given size.

    Examples:
        >>> elements = [1, 2, 3, 4, 5, 6]
        >>> for subslice in window(elements, size=3):
        ...     print(subslice)
        (1, 2, 3)
        (2, 3, 4)
        (3, 4, 5)
        (4, 5, 6)
    """
    iterator = iter(seq)

    # The first window is made of the first `size` elements.
    current = tuple(islice(iterator, size))

    # A shorter first window means the input ran out: there is no full window.
    if len(current) != size:
        return

    yield current

    # Shift by one: drop the oldest element, append the incoming one.
    for item in iterator:
        current = (*current[1:], item)
        yield current