def read_pdf(file_path):
    """
    Extract the text of every page of a PDF file as one single-line string.

    The text of each page is joined with a space and every newline is
    replaced by a space, so that words sitting on different lines or on
    different pages remain separate words in the result.

    Parameters
    ----------
    file_path : str
        Path to the PDF file. Must end in ``.pdf`` (case-insensitive).

    Returns
    -------
    str
        Text of all pages, with newlines replaced by spaces.

    Raises
    ------
    TypeError
        If `file_path` is not a string.
    ValueError
        If `file_path` does not end in ``.pdf``, or if any other error
        occurs while opening or parsing the file.
    FileNotFoundError
        If the file does not exist.

    Examples
    --------
    >>> read_pdf("cv.pdf")  # doctest: +SKIP
    'Work Experience Software Developer at XYZ Corp. Education Bachelor of Science in Computer Science'
    """
    if not isinstance(file_path, str):
        raise TypeError("file_path must be a string representing the file path")
    if not file_path.lower().endswith(".pdf"):
        raise ValueError("file_path must point to a PDF file")

    try:
        with open(file_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` keeps the join below safe if a page yields no text
            # (empty or None result).
            page_texts = [page.extract_text() or "" for page in reader.pages]
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file {file_path} does not exist.") from e
    except Exception as e:
        # Any other failure (unreadable file, malformed PDF, ...) is reported
        # as ValueError; the original exception is kept as the cause.
        raise ValueError(f"Error reading the PDF file: {e}") from e

    # Use a space, not an empty string, as the separator: deleting newlines
    # outright would glue the last word of a line to the first word of the
    # next one.
    return " ".join(page_texts).replace("\n", " ")
def clean_text(raw_text):
    """
    Lowercase text, drop English stop words, and strip punctuation.

    The text is lowercased and split on whitespace. Each token is checked
    against the NLTK English stop-word list *before* its inner punctuation
    is removed, so stop words written with an apostrophe (for example
    "don't") are filtered out instead of surviving as "dont". The remaining
    tokens have all ASCII punctuation (``string.punctuation``) removed and
    are joined with single spaces.

    Parameters
    ----------
    raw_text : str
        Text to clean.

    Returns
    -------
    str
        Cleaned text: lowercase words separated by single spaces.

    Raises
    ------
    TypeError
        If `raw_text` is not a string.

    Examples
    --------
    >>> clean_text("Work Experience: Software Developer at XYZ Corp!")
    'work experience software developer xyz corp'
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    stop_words = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    filtered_words = []
    for token in raw_text.lower().split():
        # Compare with only the surrounding punctuation removed, so that
        # apostrophe-bearing stop words still match the stop-word list.
        if token.strip(string.punctuation) in stop_words:
            continue
        word = token.translate(strip_punctuation)
        # Skip tokens that were pure punctuation, and words that only
        # become stop words once their punctuation is gone.
        if word and word not in stop_words:
            filtered_words.append(word)

    return " ".join(filtered_words)
def count_words_in_pdf(file_path):
    """
    Count the frequency of words in a PDF file.

    The PDF text is read with `read_pdf`, normalised with `clean_text`
    (lowercased, punctuation removed, common English stop words excluded),
    split on whitespace, and tallied.

    Parameters
    ----------
    file_path : str
        Path to the PDF file. Must end in ``.pdf`` (case-insensitive).

    Returns
    -------
    collections.Counter
        Dictionary-like object mapping each remaining word to the number of
        times it occurs.

    Raises
    ------
    TypeError
        If `file_path` is not a string.
    ValueError
        If `file_path` does not end in ``.pdf``.

    Notes
    -----
    Errors raised by `read_pdf` and `clean_text` propagate unchanged.

    Examples
    --------
    For a one-page PDF whose only line of text is
    ``Work experience: software developer at XYZ Corp``:

    >>> count_words_in_pdf("resume.pdf")  # doctest: +SKIP
    Counter({'work': 1, 'experience': 1, 'software': 1, 'developer': 1, 'xyz': 1, 'corp': 1})
    """
    # Validate up front so bad arguments fail before any file access.
    if not isinstance(file_path, str):
        raise TypeError("file_path must be a string representing the file path")
    if not file_path.lower().endswith(".pdf"):
        raise ValueError("file_path must point to a PDF file")

    pdf_text = read_pdf(file_path)
    cleaned_text = clean_text(pdf_text)
    word_list = cleaned_text.split()
    return Counter(word_list)