topic_extraction

Topic extraction strategies.

`DocStudy`

Bases: TypedDict

Data container for a study that will be used to generate a doc.

Attributes:

Name	Type	Description
`title`	`str`	Title of the study.
`abstract`	`str`	Abstract of the study.
`keywords`	`str`	Keywords of the study.

Examples:

>>> study: DocStudy = {
...     "title": "machine learning",
...     "abstract": "machine learning is often used in the industry with the goal of...",
...     "keywords": "machine learning, code smells, defect detection"
... }
>>> study
{'title': 'machine learning', 'abstract': 'machine learning is often used in the industry with the goal of...', 'keywords': 'machine learning, code smells, defect detection'}

Source code in src/sesg/topic_extraction/create_docs.py

class DocStudy(TypedDict):
    """Dictionary shape describing one study that a doc will be generated from.

    Attributes:
        title (str): The study's title.
        abstract (str): The study's abstract.
        keywords (str): The study's keywords, held in a single string.

    Examples:
        >>> study: DocStudy = {
        ...     "title": "machine learning",
        ...     "abstract": "machine learning is often used in the industry with the goal of...",
        ...     "keywords": "machine learning, code smells, defect detection"
        ... }
        >>> study
        {'title': 'machine learning', 'abstract': 'machine learning is often used in the industry with the goal of...', 'keywords': 'machine learning, code smells, defect detection'}
    """  # noqa: E501

    # All three keys are required (TypedDict default of total=True).
    title: str
    abstract: str
    keywords: str

`extract_topics_with_bertopic(docs, *, kmeans_n_clusters, umap_n_neighbors)`

Extracts topics from a list of documents using BERTopic.

Parameters:

Name	Type	Description	Default
`docs`	`list[str]`	List of documents.	required
`kmeans_n_clusters`	`int`	The number of clusters to form as well as the number of centroids to generate. This is equivalent to setting the number of topics.	required
`umap_n_neighbors`	`int`	Number of neighboring sample points used when making the manifold approximation. Increasing this value typically results in a more global view of the embedding structure whilst smaller values result in a more local view. Increasing this value often results in larger clusters being created.	required

Returns:

Type	Description
`list[list[str]]`	List of topics, where a topic is a list of words.

Examples:

>>> extract_topics_with_bertopic(
...     docs=["detecting code smells with machine learning", "code smells detection tools", "error detection in Java software with machine learning"],
...     kmeans_n_clusters=2,
...     umap_n_neighbors=2,
... )
[["word1 topic1", "word2 topic1"], ["word1 topic2", "word2 topic2"]]

Source code in src/sesg/topic_extraction/bertopic_strategy.py

def extract_topics_with_bertopic(
    docs: list[str],
    *,
    kmeans_n_clusters: int,
    umap_n_neighbors: int,
) -> list[list[str]]:
    """Fit a BERTopic model on the documents and return the words of every topic.

    The model is assembled from a `CountVectorizer` (english stop words,
    1- to 3-grams), a `UMAP` reducer and a `KMeans` clusterer.

    Args:
        docs (list[str]): List of documents.
        kmeans_n_clusters (int): Forwarded to `KMeans(n_clusters=...)`. The number of clusters to form as well as the number of centroids to generate. This is equivalent to setting the number of topics.
        umap_n_neighbors (int): Forwarded to `UMAP(n_neighbors=...)`. Number of neighboring sample points used when making the manifold approximation. Larger values typically give a more global view of the embedding structure (and often larger clusters), smaller values a more local view.

    Returns:
        List of topics, where a topic is a list of words.

    Examples:
        >>> extract_topics_with_bertopic(  # doctest: +SKIP
        ...     docs=["detecting code smells with machine learning", "code smells detection tools", "error detection in Java software with machine learning"],
        ...     kmeans_n_clusters=2,
        ...     umap_n_neighbors=2,
        ... )
        [["word1 topic1", "word2 topic1"], ["word1 topic2", "word2 topic2"]]
    """  # noqa: E501
    bertopic_model = BERTopic(
        language="english",
        verbose=False,
        vectorizer_model=CountVectorizer(
            stop_words="english",
            ngram_range=(1, 3),
        ),
        umap_model=UMAP(
            n_neighbors=umap_n_neighbors,
            # the remaining values are the defaults used
            # in BERTopic initialization.
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            low_memory=False,
        ),
        # KMeans is handed over through the `hdbscan_model` argument,
        # hence the type checker suppression.
        hdbscan_model=KMeans(  # type: ignore
            n_clusters=kmeans_n_clusters,
        ),
    )

    bertopic_model.fit_transform(docs)

    # `get_topics()` yields a Mapping of topic index -> list of
    # (word, score) tuples; only the word of each tuple is kept.
    words_per_topic: list[list[str]] = []
    for scored_words in bertopic_model.get_topics().values():
        words_per_topic.append(
            [word for word, _score in scored_words]  # type: ignore
        )

    return words_per_topic

`extract_topics_with_lda(docs, *, min_document_frequency, n_topics)`

Extracts topics from a list of documents using the LDA method.

Parameters:

Name	Type	Description	Default
`docs`	`list[str]`	List of documents.	required
`min_document_frequency`	`float`	CountVectorizer parameter - Minimum document frequency for the word to appear on the bag of words.	required
`n_topics`	`int`	LDA parameter - Number of topics to generate.	required

Returns:

Type	Description
`list[list[str]]`	List of topics, where a topic is a list of words.

Examples:

>>> extract_topics_with_lda(
...     docs=["detecting code smells with machine learning", "code smells detection tools", "error detection in Java software with machine learning"],
...     min_document_frequency=0.1,
...     n_topics=2,
... )
[["word1 topic1", "word2 topic1"], ["word1 topic2", "word2 topic2"]]

Source code in src/sesg/topic_extraction/lda_strategy.py

def extract_topics_with_lda(
    docs: list[str],
    *,
    min_document_frequency: float,
    n_topics: int,
) -> list[list[str]]:
    """Extract topics from a list of documents with Latent Dirichlet Allocation.

    The documents are turned into a bag of words (english stop words removed,
    1- to 3-grams) and an LDA model is fitted on it. Every returned topic
    lists all tokens of the bag of words, ordered from the highest to the
    lowest value the topic assigns to them.

    Args:
        docs (list[str]): List of documents.
        min_document_frequency (float): CountVectorizer parameter (`min_df`) - Minimum document frequency for the word to appear on the bag of words.
        n_topics (int): LDA parameter (`n_components`) - Number of topics to generate.

    Returns:
        List of topics, where a topic is a list of words.

    Examples:
        >>> extract_topics_with_lda(  # doctest: +SKIP
        ...     docs=["detecting code smells with machine learning", "code smells detection tools", "error detection in Java software with machine learning"],
        ...     min_document_frequency=0.1,
        ...     n_topics=2,
        ... )
        [["word1 topic1", "word2 topic1"], ["word1 topic2", "word2 topic2"]]
    """  # noqa: E501
    bag_of_words = CountVectorizer(
        min_df=min_document_frequency,
        max_df=1.0,
        ngram_range=(1, 3),
        max_features=None,
        stop_words="english",
    )

    term_counts = bag_of_words.fit_transform(docs)

    # `tokens[i]` is the token (word or n-gram) behind column `i`
    # of the vectorized documents.
    tokens = bag_of_words.get_feature_names_out()

    # Configure and train the Latent Dirichlet Allocation (LDA) model.
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        doc_topic_prior=None,  # alpha
        topic_word_prior=None,  # beta
        learning_method="batch",  # Batch or Online
        learning_decay=0.7,
        learning_offset=10.0,
        max_iter=5000,
        batch_size=128,
        evaluate_every=-1,
        total_samples=1000000.0,
        perp_tol=0.1,
        mean_change_tol=0.001,
        max_doc_update_iter=100,
        random_state=0,
    )

    lda_model.fit(term_counts)

    # `components_` has one row per topic, and each row is indexed
    # the same way as `tokens`. `argsort()` gives the indexes that sort
    # a row in ascending order; reversing it with `[::-1]` puts the
    # highest values first.
    words_per_topic: list[list[str]] = []
    for topic_row in lda_model.components_:
        descending_indexes = topic_row.argsort()[::-1]
        words_per_topic.append([tokens[index] for index in descending_indexes])

    return words_per_topic