evaluation

Evaluation module.

This module provides a method to evaluate the performance of a search string generated with SeSG.

`EvaluationFactory` `dataclass`

Evaluation factory.

To evaluate a search string, use the evaluate method.

Parameters:

Name	Type	Description	Default
`gs`	`list[Study]`	Gold standard.	required
`qgs`	`list[Study]`	Quasi gold standard.	required

Source code in src/sesg/evaluation/evaluation_factory.py

@dataclass(frozen=True)
class EvaluationFactory:
    """Factory that evaluates search strings against a gold standard.

    Call the [`evaluate`][sesg.evaluation.evaluation_factory.EvaluationFactory.evaluate] method with the titles returned by Scopus to obtain the metrics.

    Args:
        gs (list[Study]): Gold standard.
        qgs (list[Study]): Quasi gold standard.
    """  # noqa: E501

    gs: list[Study]
    qgs: list[Study]

    @cached_property
    def processed_gs_titles(self) -> list[str]:
        """Preprocessed title of every GS study, in GS order."""
        return [study.processed_title for study in self.gs]

    @cached_property
    def processed_qgs_titles(self) -> list[str]:
        """Preprocessed title of every QGS study, in QGS order."""
        return [study.processed_title for study in self.qgs]

    @cached_property
    def studies_dict(self) -> dict[int, Study]:
        """Lookup table from study ID to the GS study carrying that ID."""
        by_id: dict[int, Study] = {}
        for study in self.gs:
            by_id[study.id] = study
        return by_id

    def _get_study_by_id(self, id: int) -> Study:
        # Raises KeyError when the ID does not belong to a GS study.
        lookup = self.studies_dict
        return lookup[id]

    @cached_property
    def directed_adjacency_list(self) -> dict[int, list[int]]:
        """Directed adjacency list built from the GS."""
        return get_directed_adjacency_list_from_gs(self.gs)

    @cached_property
    def undirected_adjacency_list(self) -> dict[int, list[int]]:
        """Undirected counterpart of the directed GS adjacency list."""
        directed = self.directed_adjacency_list
        return directed_adjacency_list_to_undirected(directed)

    def get_qgs_in_scopus(
        self,
        processed_scopus_titles: list[str],
    ) -> list[Study]:
        """Get QGS studies that were found in Scopus.

        Each item produced by `similarity_score` is unpacked as a pair whose
        first element is used as an index into `self.qgs`.
        """
        matches = similarity_score(
            small_set=self.processed_qgs_titles,
            other_set=processed_scopus_titles,
        )

        found = []
        for index, _ in matches:
            found.append(self.qgs[index])
        return found

    def get_gs_in_scopus(
        self,
        processed_scopus_titles: list[str],
    ) -> list[Study]:
        """Get GS studies that were found in Scopus.

        Each item produced by `similarity_score` is unpacked as a pair whose
        first element is used as an index into `self.gs`.
        """
        matches = similarity_score(
            small_set=self.processed_gs_titles,
            other_set=processed_scopus_titles,
        )

        found = []
        for index, _ in matches:
            found.append(self.gs[index])
        return found

    def get_gs_in_bsb(
        self,
        gs_in_scopus: list[Study],
    ) -> list[Study]:
        """Get GS studies that were found via backward snowballing.

        Snowballing runs over the directed adjacency list, starting from the
        IDs of the given studies.
        """
        seed_ids = [study.id for study in gs_in_scopus]
        reached_ids = snowballing(
            adjacency_list=self.directed_adjacency_list,
            start_set=seed_ids,
        )

        return [self._get_study_by_id(study_id) for study_id in reached_ids]

    def get_gs_in_sb(
        self,
        gs_in_scopus: list[Study],
    ) -> list[Study]:
        """Get GS studies that were found via backward or forward snowballing.

        Snowballing runs over the undirected adjacency list, starting from the
        IDs of the given studies.
        """
        seed_ids = [study.id for study in gs_in_scopus]
        reached_ids = snowballing(
            adjacency_list=self.undirected_adjacency_list,
            start_set=seed_ids,
        )

        return [self._get_study_by_id(study_id) for study_id in reached_ids]

    def evaluate(
        self,
        scopus_results: list[str],
    ) -> Evaluation:
        """Evaluate the performance of a search string using the results returned by Scopus.

        Args:
            scopus_results (list[str]): List with the titles of the studies returned by Scopus.

        Returns:
            An object with the evaluation metrics.
        """  # noqa: E501
        # Titles are preprocessed once and shared by the QGS and GS lookups.
        processed_titles = [process_title(raw_title) for raw_title in scopus_results]

        qgs_found = self.get_qgs_in_scopus(processed_titles)
        gs_found = self.get_gs_in_scopus(processed_titles)

        return Evaluation(
            qgs_in_scopus=qgs_found,
            gs_in_scopus=gs_found,
            gs_in_bsb=self.get_gs_in_bsb(gs_found),
            gs_in_sb=self.get_gs_in_sb(gs_found),
            gs_size=len(self.gs),
            n_scopus_results=len(scopus_results),
        )

`directed_adjacency_list: dict[int, list[int]]` `property` `cached`

Directed adjacency list of the GS.

`processed_gs_titles: list[str]` `property` `cached`

Preprocessed GS titles.

`processed_qgs_titles: list[str]` `property` `cached`

Preprocessed QGS titles.

`studies_dict: dict[int, Study]` `property` `cached`

Dictionary mapping a study ID to a study.

`undirected_adjacency_list: dict[int, list[int]]` `property` `cached`

Undirected adjacency list of the GS.

`evaluate(scopus_results)`

Evaluate the performance of a search string using the results returned by Scopus.

Parameters:

Name	Type	Description	Default
`scopus_results`	`list[str]`	List with the titles of the studies returned by Scopus.	required

Returns:

Type	Description
`Evaluation`	An object with the evaluation metrics.

Source code in src/sesg/evaluation/evaluation_factory.py

def evaluate(
    self,
    scopus_results: list[str],
) -> Evaluation:
    """Evaluate the performance of a search string using the results returned by Scopus.

    Args:
        scopus_results (list[str]): List with the titles of the studies returned by Scopus.

    Returns:
        An object with the evaluation metrics.
    """  # noqa: E501
    # Titles are preprocessed once and shared by the QGS and GS lookups.
    processed_titles = [process_title(raw_title) for raw_title in scopus_results]

    qgs_found = self.get_qgs_in_scopus(processed_titles)
    gs_found = self.get_gs_in_scopus(processed_titles)

    return Evaluation(
        qgs_in_scopus=qgs_found,
        gs_in_scopus=gs_found,
        gs_in_bsb=self.get_gs_in_bsb(gs_found),
        gs_in_sb=self.get_gs_in_sb(gs_found),
        gs_size=len(self.gs),
        n_scopus_results=len(scopus_results),
    )

`get_gs_in_bsb(gs_in_scopus)`

Get GS studies that were found via backward snowballing.

Source code in src/sesg/evaluation/evaluation_factory.py

def get_gs_in_bsb(
    self,
    gs_in_scopus: list[Study],
) -> list[Study]:
    """Get GS studies that were found via backward snowballing.

    Snowballing runs over the directed adjacency list, starting from the
    IDs of the given studies.
    """
    seed_ids = [study.id for study in gs_in_scopus]
    reached_ids = snowballing(
        adjacency_list=self.directed_adjacency_list,
        start_set=seed_ids,
    )

    return [self._get_study_by_id(study_id) for study_id in reached_ids]

`get_gs_in_sb(gs_in_scopus)`

Get GS studies that were found via backward or forward snowballing.

Source code in src/sesg/evaluation/evaluation_factory.py

def get_gs_in_sb(
    self,
    gs_in_scopus: list[Study],
) -> list[Study]:
    """Get GS studies that were found via backward or forward snowballing.

    Snowballing runs over the undirected adjacency list, starting from the
    IDs of the given studies.
    """
    seed_ids = [study.id for study in gs_in_scopus]
    reached_ids = snowballing(
        adjacency_list=self.undirected_adjacency_list,
        start_set=seed_ids,
    )

    return [self._get_study_by_id(study_id) for study_id in reached_ids]

`get_gs_in_scopus(processed_scopus_titles)`

Get GS studies that were found in Scopus.

Source code in src/sesg/evaluation/evaluation_factory.py

def get_gs_in_scopus(
    self,
    processed_scopus_titles: list[str],
) -> list[Study]:
    """Get GS studies that were found in Scopus.

    Each item produced by `similarity_score` is unpacked as a pair whose
    first element is used as an index into `self.gs`.
    """
    matches = similarity_score(
        small_set=self.processed_gs_titles,
        other_set=processed_scopus_titles,
    )

    found = []
    for index, _ in matches:
        found.append(self.gs[index])
    return found

`get_qgs_in_scopus(processed_scopus_titles)`

Get QGS studies that were found in Scopus.

Source code in src/sesg/evaluation/evaluation_factory.py

def get_qgs_in_scopus(
    self,
    processed_scopus_titles: list[str],
) -> list[Study]:
    """Get QGS studies that were found in Scopus.

    Each item produced by `similarity_score` is unpacked as a pair whose
    first element is used as an index into `self.qgs`.
    """
    matches = similarity_score(
        small_set=self.processed_qgs_titles,
        other_set=processed_scopus_titles,
    )

    found = []
    for index, _ in matches:
        found.append(self.qgs[index])
    return found

`Study` `dataclass`

Represents a study.

Parameters:

Name	Type	Description	Default
`id`	`int`	Study's ID.	required
`title`	`str`	Study's title.	required
`references`	`list[Study]`	Study's references. Defaults to an empty list when omitted.	`field(default_factory=list)`

Source code in src/sesg/evaluation/evaluation_factory.py

@dataclass(unsafe_hash=True)
class Study:
    """Represents a study.

    Instances are hashable: the generated hash is computed from `id` and `title`.
    `references` is left out of the hash because it is a list (unhashable), but it still takes part in equality comparison.

    Args:
        id (int): Study's ID.
        title (str): Study's title.
        references (list[Study]): Study's references. Defaults to an empty list when omitted.
    """  # noqa: E501

    id: int
    title: str

    # hash=False keeps the list out of the generated __hash__; with it included,
    # hash(study) always raised "TypeError: unhashable type: 'list'".
    # Equal studies still share `id` and `title`, so the eq/hash contract holds.
    references: list["Study"] = field(default_factory=list, hash=False)

    @cached_property
    def processed_title(self) -> str:
        """Preprocessed title, computed from `title` on first access and then cached."""
        return process_title(self.title)

`processed_title` `property` `cached`

Preprocessed title.

`create_citation_graph(*, adjacency_list, studies_titles, start_set=None)`

Creates a graphviz.Digraph instance with the following properties.

Filled nodes: nodes on the start set.
Bold nodes: nodes found via snowballing on the start set.
Dashed nodes: nodes that are not on the start set, nor were found via snowballing.

Parameters:

Name	Type	Description	Default
`adjacency_list`	`dict[int, list[int]]`	A dict mapping a study ID to its neighbors (citations/references).	required
`studies_titles`	`dict[int, str]`	A dict mapping a study ID to its title.	required
`start_set`	`Optional[list[int]]`	Start set. List of study IDs. If None, will default to an empty list.	`None`

Returns:

Type	Description
`Digraph`	A graphviz dot object with the said properties.

Examples:

>>> adjacency_list = {1: [2], 2: [3, 4], 3: [4, 5], 4: [6], 5: [7]}
>>> studies_titles = {1: "Paper 1", 2: "Paper 2", 3: "Paper 3", 4: "Paper 4", 5: "Paper 5", 6: "Paper 6", 7: "Paper 7"}
>>> start_set = [1, 3]
>>> g = create_citation_graph(adjacency_list=adjacency_list, studies_titles=studies_titles, start_set=start_set)
>>> g.render(
...     filename="graph.dot",
...     directory="out",
...     format="pdf",
... )

Source code in src/sesg/evaluation/graph.py

def create_citation_graph(
    *,
    adjacency_list: dict[int, list[int]],
    studies_titles: dict[int, str],
    start_set: Optional[list[int]] = None,
) -> Digraph:
    """Builds a `graphviz.Digraph` of the citation graph, styled by how each study was found.

    - Filled nodes: nodes on the start set.
    - Bold nodes: nodes found via snowballing on the start set.
    - Dashed nodes: nodes that are not on the start set, nor were found via snowballing.

    Node names are the study IDs left-padded with zeros to the number of digits of `len(studies_titles)`.

    Args:
        adjacency_list (dict[int, list[int]]): A dict mapping a study ID to its neighbors (citations/references).
        studies_titles (dict[int, str]): A dict mapping a study ID to its title. Titles are used as node tooltips.
        start_set (Optional[list[int]]): Start set. List of study IDs. If None, will default to an empty list.

    Returns:
        A graphviz dot object with the said properties.

    Raises:
        KeyError: If a start set or snowballing node has no entry in `studies_titles`.

    Examples:
        >>> adjacency_list = {1: [2], 2: [3, 4], 3: [4, 5], 4: [6], 5: [7]}
        >>> studies_titles = {1: "Paper 1", 2: "Paper 2", 3: "Paper 3", 4: "Paper 4", 5: "Paper 5", 6: "Paper 6", 7: "Paper 7"}
        >>> start_set = [1, 3]
        >>> g = create_citation_graph(adjacency_list=adjacency_list, studies_titles=studies_titles, start_set=start_set)  # doctest: +SKIP
        >>> g.render(  # doctest: +SKIP
        ...     filename="graph.dot",
        ...     directory="out",
        ...     format="pdf",
        ... )
    """  # noqa: E501
    seeds = [] if start_set is None else start_set

    dot = Digraph(strict=True)

    # Zero-pad node names to the number of digits in the study count.
    width = len(str(len(studies_titles)))

    def node_name(study_id: int) -> str:
        return str(study_id).zfill(width)

    # Every known study starts out as "not found" (dashed); the found ones
    # are re-declared with a different style further down.
    for study_id in studies_titles:
        dot.node(
            node_name(study_id),
            tooltip=studies_titles[study_id],
            style="dashed",
        )

    # One edge per (study, neighbor) pair of the adjacency list.
    edge_pairs = (
        (source, target)
        for source, targets in adjacency_list.items()
        for target in targets
    )
    for source, target in edge_pairs:
        dot.edge(node_name(source), node_name(target))

    reached = snowballing(
        adjacency_list=adjacency_list,
        start_set=seeds,
    )

    # Order matters: snowballing nodes are styled first and start set nodes
    # last, so a node that belongs to both ends up marked as start set.
    for group, node_style in ((reached, "bold"), (seeds, "filled")):
        for study_id in group:
            dot.node(
                node_name(study_id),
                shape="circle",
                style=node_style,
                tooltip=studies_titles[study_id],
            )

    return dot

evaluation

EvaluationFactory dataclass

directed_adjacency_list: dict[int, list[int]] property cached

processed_gs_titles: list[str] property cached

processed_qgs_titles: list[str] property cached

studies_dict: dict[int, Study] property cached

undirected_adjacency_list: dict[int, list[int]] property cached

evaluate(scopus_results)

get_gs_in_bsb(gs_in_scopus)

get_gs_in_sb(gs_in_scopus)

get_gs_in_scopus(processed_scopus_titles)

get_qgs_in_scopus(processed_scopus_titles)

Study dataclass

processed_title property cached

create_citation_graph(*, adjacency_list, studies_titles, start_set=None)

`EvaluationFactory` `dataclass`

`directed_adjacency_list: dict[int, list[int]]` `property` `cached`

`processed_gs_titles: list[str]` `property` `cached`

`processed_qgs_titles: list[str]` `property` `cached`

`studies_dict: dict[int, Study]` `property` `cached`

`undirected_adjacency_list: dict[int, list[int]]` `property` `cached`

`evaluate(scopus_results)`

`get_gs_in_bsb(gs_in_scopus)`

`get_gs_in_sb(gs_in_scopus)`

`get_gs_in_scopus(processed_scopus_titles)`

`get_qgs_in_scopus(processed_scopus_titles)`

`Study` `dataclass`

`processed_title` `property` `cached`

`create_citation_graph(*, adjacency_list, studies_titles, start_set=None)`