Source code for dsresumatch.pdf_cv_processing

import PyPDF2 # type: ignore
import string
from collections import Counter
from nltk.corpus import stopwords # type: ignore
import nltk # type: ignore

# Ensure the NLTK 'stopwords' corpus is available before any text cleaning.
# This call executes at import time (module top level); clean_text() below
# reads the corpus through stopwords.words('english').
nltk.download('stopwords')

def read_pdf(file_path):
    r"""
    Extract text content from a PDF file and return it as a single string.

    Line breaks produced by the extractor are preserved and the text of
    consecutive pages is separated by a newline, so words that sit on
    adjacent lines or pages are never fused into one token.

    Parameters
    ----------
    file_path : str
        Path to the PDF file.

    Returns
    -------
    str
        PDF file contents as text.

    Raises
    ------
    TypeError
        If `file_path` is not a string.
    ValueError
        If `file_path` does not end with ``.pdf`` (case-insensitive), or if
        any error other than a missing file occurs while opening or parsing
        the PDF.
    FileNotFoundError
        If `file_path` does not exist.

    Examples
    --------
    >>> read_pdf("cv.pdf")
    'Work Experience\nSoftware Developer at XYZ Corp.\nEducation\nBachelor of Science in Computer Science\n'
    """
    if not isinstance(file_path, str):
        raise TypeError("file_path must be a string representing the file path")
    if not file_path.lower().endswith(".pdf"):
        raise ValueError("file_path must point to a PDF file")

    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # A page with no extractable text contributes an empty string
            # instead of breaking the join below.
            page_texts = [page.extract_text() or "" for page in reader.pages]
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file {file_path} does not exist.") from e
    except Exception as e:
        # Any other failure (unreadable file, malformed PDF, ...) is reported
        # to callers as ValueError, with the original error chained.
        raise ValueError(f"Error reading the PDF file: {e}") from e

    # Keep the extractor's line breaks: stripping them glues the last word of
    # one line to the first word of the next and corrupts later word counts.
    return "\n".join(page_texts)
def clean_text(raw_text):
    """
    Normalise text for word-level analysis.

    The input is lowercased, every character in ``string.punctuation`` is
    deleted, and the remaining whitespace-separated words are filtered
    against NLTK's English stop-word list. Surviving words are joined with
    single spaces.

    Parameters
    ----------
    raw_text : str
        Text to clean.

    Returns
    -------
    str
        Cleaned text.

    Raises
    ------
    TypeError
        If `raw_text` is not a string.

    Examples
    --------
    >>> clean_text("Work Experience: Software Developer at XYZ Corp!")
    'work experience software developer xyz corp'
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    # A single translate() pass deletes all punctuation characters.
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalised = raw_text.lower().translate(punctuation_table)

    # Set membership keeps the per-word stop-word check cheap.
    english_stop_words = set(stopwords.words('english'))
    return " ".join(
        token for token in normalised.split() if token not in english_stop_words
    )
def count_words_in_pdf(file_path):
    """
    Count how often each meaningful word occurs in a PDF file.

    The PDF text is read with `read_pdf` and normalised with `clean_text`
    (lowercased, punctuation removed, English stop words dropped) before
    the remaining words are tallied.

    Parameters
    ----------
    file_path : str
        Path to the PDF file.

    Returns
    -------
    collections.Counter
        Dictionary-like object mapping each remaining word to the number of
        times it occurs.

    Raises
    ------
    TypeError
        If `file_path` is not a string.
    ValueError
        If `file_path` does not end with ``.pdf`` (case-insensitive).

    Examples
    --------
    >>> count_words_in_pdf("cv.pdf")
    Counter({'work': 1, 'experience': 1, 'software': 1, 'developer': 1, 'xyz': 1, 'corp': 1, 'education': 1, 'bachelor': 1, 'science': 1, 'computer': 1})
    """
    if not isinstance(file_path, str):
        raise TypeError("file_path must be a string representing the file path")

    has_pdf_suffix = file_path.lower().endswith(".pdf")
    if not has_pdf_suffix:
        raise ValueError("file_path must point to a PDF file")

    # Read -> clean -> split -> tally.
    return Counter(clean_text(read_pdf(file_path)).split())