Skip to content

snowballing

Snowballing module.

Snowballing strategies to retrieve the citation graph of a set of studies.

FuzzyBackwardSnowballingStudy

Represents a study that will be included in backward snowballing.

The constructor will preprocess the title and text content to the correct format.

Examples:

>>> s = FuzzyBackwardSnowballingStudy(id=1, title=" title. HERE ", text_content=" text. \n \r\n HERE ")
>>> s.title == "titlehere", s.text_content == "texthere"
(True, True)
Source code in src/sesg/snowballing/fuzzy_bsb.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class FuzzyBackwardSnowballingStudy:
    r"""A study prepared to take part in fuzzy backward snowballing.

    On construction, the title is passed through ``preprocess_title`` and the
    text content through ``preprocess_text``; the properties expose the
    preprocessed values only, and no setters are provided.

    Examples:
        >>> s = FuzzyBackwardSnowballingStudy(id=1, title=" title. HERE ", text_content=" text. \n \r\n HERE ")
        >>> s.title == "titlehere", s.text_content == "texthere"
        (True, True)
    """  # noqa: E501

    # Name-mangled storage backing the read-only properties below.
    __id: int
    __title: str
    __text_content: str

    def __init__(self, *, id: int, title: str, text_content: str) -> None:
        """Builds a study, preprocessing its title and text content.

        All arguments are keyword-only.

        Args:
            id (int): Identifier of the study. Could be a database id, for example.
            title (str): Title of the study (stored after preprocessing).
            text_content (str): Content of the study (stored after preprocessing).
                Could be extracted from a PDF with CERMINE.
        """  # noqa: E501
        self.__id = id

        cleaned_title = preprocess_title(title)
        self.__title = cleaned_title

        cleaned_text = preprocess_text(text_content)
        self.__text_content = cleaned_text

    @property
    def id(self) -> int:
        """Identifier given to the study at construction."""
        return self.__id

    @property
    def title(self) -> str:
        """Preprocessed title of the study."""
        return self.__title

    @property
    def text_content(self) -> str:
        """Preprocessed text content of the study."""
        return self.__text_content

id: int property

ID of the study.

text_content: str property

Text content of the study.

title: str property

Title of the study.

__init__(*, id, title, text_content)

Creates an instance of a FuzzyBackwardSnowballingStudy.

Parameters:

Name Type Description Default
id int

Identifier of the study. Could be a database id, for example.

required
title str

Title of the study.

required
text_content str

Content of the study. Could be extracted from a PDF with CERMINE.

required
Source code in src/sesg/snowballing/fuzzy_bsb.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
def __init__(self, *, id: int, title: str, text_content: str) -> None:
    """Builds a study, preprocessing its title and text content.

    All arguments are keyword-only.

    Args:
        id (int): Identifier of the study. Could be a database id, for example.
        title (str): Title of the study (stored after preprocessing).
        text_content (str): Content of the study (stored after preprocessing).
            Could be extracted from a PDF with CERMINE.
    """  # noqa: E501
    self.__id = id

    cleaned_title = preprocess_title(title)
    self.__title = cleaned_title

    cleaned_text = preprocess_text(text_content)
    self.__text_content = cleaned_text

fuzzy_backward_snowballing(studies)

Runs backward snowballing in the given list of studies.

Parameters:

Name Type Description Default
studies list[FuzzyBackwardSnowballingStudy]

List of studies with id, title, and text content.

required

Yields:

Type Description
Iterator[tuple[FuzzyBackwardSnowballingStudy, list[FuzzyBackwardSnowballingStudy]]]

A tuple holding a study and its references.

Examples:

>>> studies: list[FuzzyBackwardSnowballingStudy] = [
...     FuzzyBackwardSnowballingStudy(id=1, title="title 1", text_content="... REFERENCES: machine learning, a SLR"),
...     FuzzyBackwardSnowballingStudy(id=2, title="machine learning, a SLR", text_content="... REFERENCES: other studies"),
... ]
>>>
>>> for study, references in fuzzy_backward_snowballing(studies):
...     print((study.id, [r.id for r in references]))
(1, [2])
(2, [])
Source code in src/sesg/snowballing/fuzzy_bsb.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def fuzzy_backward_snowballing(
    studies: list[FuzzyBackwardSnowballingStudy],
) -> Iterator[
    tuple[FuzzyBackwardSnowballingStudy, list[FuzzyBackwardSnowballingStudy]]
]:
    """Runs backward snowballing on the given list of studies.

    For each study, the title of every study in the list is checked against
    that study's text content using `pooled_check_title_is_in_text`, with the
    checks distributed over a process pool. The studies whose check returns
    True are reported as the references of the current study.

    A single process pool is created on first iteration and reused for every
    study; it is released when the generator is exhausted or closed.

    Args:
        studies (list[FuzzyBackwardSnowballingStudy]): List of studies with id, title, and text content.

    Yields:
        A tuple holding a study and its references.

    Examples:
        >>> studies: list[FuzzyBackwardSnowballingStudy] = [
        ...     FuzzyBackwardSnowballingStudy(id=1, title="title 1", text_content="... REFERENCES: machine learning, a SLR"),
        ...     FuzzyBackwardSnowballingStudy(id=2, title="machine learning, a SLR", text_content="... REFERENCES: other studies"),
        ... ]
        >>>
        >>> for study, references in fuzzy_backward_snowballing(studies):
        ...     print((study.id, [r.id for r in references]))
        (1, [2])
        (2, [])
    """  # noqa: E501
    # Nothing to compare: do not spawn worker processes for an empty input.
    if not studies:
        return

    # Creating a pool spawns worker processes, so it is done once here instead
    # of once per study. The `with` block still terminates the pool when the
    # generator finishes or is closed early by the consumer.
    with Pool() as p:
        for study_index, study in enumerate(studies):
            func_args: list[PooledTitleIsInTextArgs] = [
                {
                    # when `study_index == reference_index`, we would be checking
                    # if the study cites itself, so that entry is flagged to be
                    # skipped (and is expected to come back as False)
                    "skip": study_index == reference_index,
                    "text": study.text_content,
                    "title": reference.title,
                }
                for reference_index, reference in enumerate(studies)
            ]

            # `p.map` preserves input order, so `is_cited_list[j]` being True
            # means the current study cites `studies[j]`
            is_cited_list: list[bool] = p.map(
                pooled_check_title_is_in_text,
                func_args,
            )

            references = [
                ref for ref, is_cited in zip(studies, is_cited_list) if is_cited
            ]

            yield study, references