# Source code for dsresumatch.evaluate_keywords

import json
import warnings
from pathlib import Path

# load in baseline keywords

# [docs]
def load_baseline_keywords():
    """Load the baseline keywords bundled next to this module.

    Reads ``data/baseline_keywords.json`` (resolved relative to this file),
    which is expected to be a JSON object whose values are lists of keyword
    strings grouped by category. The categories are flattened into a single
    list and every keyword is lowercased so it can be matched
    case-insensitively against resume text.

    Returns
    -------
    list of str
        The baseline keywords in lowercase, in the order they appear in the
        JSON file. Duplicates across categories are not removed.

    Raises
    ------
    FileNotFoundError
        If the JSON file containing the baseline keywords cannot be found.
    json.JSONDecodeError
        If the JSON file is not properly formatted.
    """
    data_path = Path(__file__).parent / "data" / "baseline_keywords.json"

    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform locale encoding, which can fail on or corrupt non-ASCII
    # keywords on some systems.
    with data_path.open("r", encoding="utf-8") as f:
        keywords_dict = json.load(f)

    # flatten all categories into a single list
    return [keyword.lower() for category in keywords_dict.values() for keyword in category]



# [docs]
def evaluate_keywords(cleaned_text, keywords=None, use_only_supplied_keywords=False):
    """
    Report which keywords are missing from a resume's text.

    The resume text is compared, case-insensitively, against the supplied
    keywords, the baseline keywords, or both. A keyword counts as present when
    it occurs anywhere in the text as a substring (so ``"sql"`` is considered
    present in ``"mysql"``).

    Parameters
    ----------
    cleaned_text : str
        The cleaned text content of the resume.
    keywords : list of str, optional
        Keywords to compare against the resume content. Any iterable of
        strings is accepted; a single bare string is treated as one keyword.
        If not provided, only the baseline keywords are used. If
        `use_only_supplied_keywords` is True and no keywords are supplied,
        nothing is checked and an empty list is returned.
    use_only_supplied_keywords : bool, optional
        If True, check only the supplied keywords. If False (the default),
        check the supplied keywords together with the baseline keywords.

    Returns
    -------
    list of str
        The lowercased keywords that do not appear in `cleaned_text`, without
        duplicates. Supplied keywords come first, in the order given, followed
        by baseline keywords in the order they are loaded.

    Raises
    ------
    TypeError
        If `cleaned_text` is not a string, or if any supplied keyword is not
        a string.

    Warns
    -----
    UserWarning
        If `cleaned_text` is empty or whitespace-only, or if
        `use_only_supplied_keywords` is True but no keywords were supplied.

    Examples
    --------
    >>> evaluate_keywords("software development project management agile methodologies", ["software", "agile", "teamwork"], use_only_supplied_keywords=True)
    ['teamwork']

    >>> evaluate_keywords("python sql", ["Tableau", "python", "Spark"], use_only_supplied_keywords=True)
    ['tableau', 'spark']
    """
    # input validation: verify text and keywords are strings
    if not isinstance(cleaned_text, str):
        raise TypeError("cleaned_text must be a string")

    if keywords is None:
        supplied = []
    elif isinstance(keywords, str):
        # A bare string is itself an iterable of 1-character strings; without
        # this branch "python" would be checked as 'p', 'y', 't', ...
        supplied = [keywords]
    else:
        # Materialise once so one-shot iterables (e.g. generators) are not
        # exhausted by validation before they are actually used.
        supplied = list(keywords)

    if not all(isinstance(k, str) for k in supplied):
        raise TypeError("All keywords must be strings")

    # Check for empty text and warn user (stacklevel=2 attributes the warning
    # to the caller's line rather than to this module).
    if not cleaned_text.strip():
        warnings.warn("The provided resume text is an empty string. Returning all baseline keywords as missing.", UserWarning, stacklevel=2)

    # Warn if user wants to use only supplied keywords but provides none
    if use_only_supplied_keywords and not supplied:
        warnings.warn("No keywords provided while use_only_supplied_keywords=True. Returning empty list.", UserWarning, stacklevel=2)

    # convert text to lowercase for case-insensitive matching
    lowered_text = cleaned_text.lower()

    # A dict is used as an ordered set: it removes duplicates while keeping
    # first-seen order, so the result is the same on every run (iterating a
    # set of strings is not, because string hashing is randomised per process).
    candidates = dict.fromkeys(k.lower() for k in supplied)

    # add baseline keywords if needed
    if not use_only_supplied_keywords:
        candidates.update(dict.fromkeys(load_baseline_keywords()))

    # With no candidates this is naturally an empty list.
    return [keyword for keyword in candidates if keyword not in lowered_text]