import string
import re
import codecs
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import wordnet
from langchain.text_splitter import RecursiveCharacterTextSplitter
def compact_text(text):
"""
    Compact the text by removing unnecessary whitespace and fixing punctuation artifacts.
Args:
text (str): Input text to be compacted.
Returns:
str: Compacted text.
"""
text = text.replace("\n", ". ").replace("- ", "")
text = text.replace(": .", ":").replace(":.", ":")
text = text.replace(" ", " ")
text = text.replace(".. ", ". ")
return text
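# Illustrative usage (a sketch, not part of the original module):
#     compact_text("First line\nsecond line")  # -> 'First line. second line'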
def remove_punctuation(text):
"""
Remove all punctuation from the given text.
Args:
text (str): Input text from which punctuation will be removed.
Returns:
str: Text without punctuation.
"""
for punctuation in string.punctuation:
text = text.replace(punctuation, '')
return text
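# Illustrative usage (a sketch):
#     remove_punctuation("Hello, world!")  # -> 'Hello world'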
def clean_data(item):
"""
Clean the text data.
Args:
        item (Union[str, list, dict]): An object containing text data, which is cleaned recursively.
Returns:
The cleaned data in the same format as item.
"""
if isinstance(item, str):
item = ' '.join(BeautifulSoup(item, "lxml").text.split())
elif isinstance(item, list):
item = [clean_data(i) for i in item]
    elif isinstance(item, dict):
        # Normalize keys to snake_case (lowercase, punctuation stripped) and clean values recursively.
        item = {remove_punctuation(clean_data(k).lower()).replace(' ', '_'): clean_data(v)
                for k, v in item.items()}
return item
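# Illustrative usage (a sketch; assumes bs4 with the lxml parser installed):
#     clean_data("<p>Hello   <b>world</b></p>")  # -> 'Hello world'
#     clean_data({"First Name:": "<b>Ada</b>"})  # -> {'first_name': 'Ada'}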
def chunk_text(text, chunk_size):
"""
Split text into chunks of specified size.
Args:
text (str): Input text to be chunked.
        chunk_size (int): Maximum size of each chunk, in characters.
Returns:
list: List of text chunks.
"""
    custom_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 5,  # 20% overlap between consecutive chunks
        length_function=len
    )
    documents = custom_text_splitter.create_documents([text])
    chunks = [doc.page_content for doc in documents]
    return chunks
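# Illustrative usage (a sketch; exact chunk boundaries depend on the installed
# langchain splitter version):
#     chunks = chunk_text(long_text, 1000)
#     # each chunk is at most ~1000 characters, with ~200-character overlap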
def clean_dict(dictionary, remove_values=['', 'nan']):
"""
    Recursively clean a dictionary by removing entries with unwanted values.
    Args:
        dictionary (dict): A dictionary to be cleaned.
        remove_values (list): Values (compared by string form) whose entries are removed.
Returns:
dict: Cleaned dictionary.
"""
new_dict = {}
for k, v in dictionary.items():
if isinstance(v, dict):
new_dict[k] = clean_dict(v, remove_values)
        elif str(v) in remove_values:
            pass  # drop entries whose stringified value is unwanted
else:
new_dict[k] = v
return new_dict
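# Illustrative usage (a sketch):
#     clean_dict({'a': 1, 'b': '', 'c': {'d': 'nan', 'e': 2}})
#     # -> {'a': 1, 'c': {'e': 2}}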
def normalize_answer(s):
"""
Normalize text by removing punctuation, articles and extra whitespace, and lowercasing the text.
Args:
s (str): Input text to be normalized.
Returns:
str: Normalized text.
"""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
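# Illustrative usage (a sketch):
#     normalize_answer("The Quick, Brown Fox!")  # -> 'quick brown fox'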
def recall_score(prediction, ground_truth):
"""
    Calculate the token-level recall between prediction and ground truth.
Args:
prediction (str): Predicted text.
ground_truth (str): Ground truth text.
Returns:
float: Recall score.
"""
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
    if num_same == 0:
        return 0.0
recall = 1.0 * num_same / len(ground_truth_tokens)
return recall
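# Illustrative usage (a sketch): 2 of the 4 normalized ground-truth tokens match.
#     recall_score("the cat sat", "a cat sat on the mat")  # -> 0.5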
def f1_score(prediction, ground_truth):
"""
    Calculate the token-level F1 score between prediction and ground truth.
Args:
prediction (str): Predicted text.
ground_truth (str): Ground truth text.
Returns:
float: F1 score.
"""
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
    if num_same == 0:
        return 0.0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
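# Illustrative usage (a sketch): precision 1.0 and recall 0.5 give a harmonic mean of 2/3.
#     f1_score("the cat sat", "a cat sat on the mat")  # -> 0.666...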
def exact_match_score(prediction, ground_truth):
"""
Calculate the exact match score between prediction and ground truth.
Args:
prediction (str): Predicted text.
ground_truth (str): Ground truth text.
Returns:
float: Exact match score.
"""
return float(normalize_answer(prediction) == normalize_answer(ground_truth))
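# Illustrative usage (a sketch): both sides normalize to 'answer'.
#     exact_match_score("The answer!", "answer")  # -> 1.0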
# Pluralization and Synonym extraction
ABERRANT_PLURAL_MAP = {
'appendix': 'appendices',
'barracks': 'barracks',
'cactus': 'cacti',
'child': 'children',
'criterion': 'criteria',
'deer': 'deer',
'echo': 'echoes',
'elf': 'elves',
'embargo': 'embargoes',
'focus': 'foci',
'fungus': 'fungi',
'goose': 'geese',
'hero': 'heroes',
'hoof': 'hooves',
'index': 'indices',
'knife': 'knives',
'leaf': 'leaves',
'life': 'lives',
'man': 'men',
'mouse': 'mice',
'nucleus': 'nuclei',
'person': 'people',
'phenomenon': 'phenomena',
'potato': 'potatoes',
'self': 'selves',
'syllabus': 'syllabi',
'tomato': 'tomatoes',
'torpedo': 'torpedoes',
'veto': 'vetoes',
'woman': 'women',
}
VOWELS = set('aeiou')
def pluralize(singular):
"""
Return the plural form of a given lowercase singular word (English only).
Args:
singular (str): Singular word.
Returns:
str: Plural form of the word.
"""
if not singular:
return ''
plural = ABERRANT_PLURAL_MAP.get(singular)
if plural:
return plural
root = singular
    try:
        if singular[-1] == 'y' and singular[-2] not in VOWELS:
            # consonant + y -> -ies (city -> cities)
            root = singular[:-1]
            suffix = 'ies'
        elif singular[-1] == 's':
            if singular[-2] in VOWELS:
                if singular[-3:] == 'ius':
                    # -ius -> -ii (radius -> radii)
                    root = singular[:-2]
                    suffix = 'i'
                else:
                    # vowel + s -> -ses (bus -> buses)
                    root = singular[:-1]
                    suffix = 'ses'
            else:
                # consonant + s -> -es (lens -> lenses)
                suffix = 'es'
        elif singular[-2:] in ('ch', 'sh'):
            # -ch / -sh -> -es (church -> churches)
            suffix = 'es'
        else:
            suffix = 's'
    except IndexError:
        # Word too short for the rules above (e.g. a single letter).
        suffix = 's'
plural = root + suffix
return plural
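# Illustrative usage (a sketch):
#     pluralize('city')    # -> 'cities'
#     pluralize('radius')  # -> 'radii'
#     pluralize('child')   # -> 'children'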
def decode_escapes(s):
"""
Decode escape sequences in a string.
Args:
s (str): Input string with escape sequences.
Returns:
str: Decoded string.
"""
ESCAPE_SEQUENCE_RE = re.compile(r'''
( \\U........ # 8-digit hex escapes
| \\u.... # 4-digit hex escapes
| \\x.. # 2-digit hex escapes
| \\[0-7]{1,3} # Octal escapes
| \\N\{[^}]+\} # Unicode characters by name
| \\[\\'"abfnrtv] # Single-character escapes
)''', re.UNICODE | re.VERBOSE)
def decode_match(match):
return codecs.decode(match.group(0), 'unicode-escape')
return ESCAPE_SEQUENCE_RE.sub(decode_match, s)
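# Illustrative usage (a sketch; 'unicode-escape' decoding is applied only to
# the matched escape sequences, leaving the rest of the string untouched):
#     decode_escapes("caf\\xe9")  # -> 'café'
#     decode_escapes("a\\tb")     # -> 'a\tb' with a real tab character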