Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import os | |
| import tempfile | |
| # from pypdf import PdfReader, PdfWriter | |
| # from pdf2image import convert_from_path | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.prompts import PromptTemplate | |
| from langchain_groq import ChatGroq | |
| from langchain.chains.llm import LLMChain | |
| import tempfile | |
| import markdown2 | |
| from weasyprint import HTML | |
| from io import BytesIO | |
def format_string(input_string):
    """Return the part of *input_string* that follows its first colon.

    Everything up to and including the first ":" is discarded. When the
    string contains no colon at all, it is returned unchanged.
    """
    _, colon, remainder = input_string.partition(":")
    if not colon:
        # No separator present: hand the text back as-is.
        return input_string
    return remainder
def save_uploaded_file(uploadedfile):
    """Write an uploaded file into the system temporary directory.

    Args:
        uploadedfile: Object exposing a ``name`` attribute and a
            ``getbuffer()`` method returning the file's bytes.

    Returns:
        The path of the written file, located directly inside
        ``tempfile.gettempdir()``.
    """
    temp_dir = tempfile.gettempdir()
    # The file name is supplied by the client. Keep only its final path
    # component so names such as "../../x" or absolute paths cannot make
    # us write outside the temp directory.
    raw_name = str(uploadedfile.name or "").replace("\\", "/")
    safe_name = os.path.basename(raw_name)
    if safe_name in ("", ".", ".."):
        safe_name = "upload.pdf"
    save_path = os.path.join(temp_dir, safe_name)
    with open(save_path, "wb") as f:
        f.write(uploadedfile.getbuffer())
    return save_path
def read_pdf(file_path):
    """Extract the text of a PDF as a single string.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The page texts concatenated in order, each prefixed with a space
        and followed by a blank line.
    """
    loader = PyPDFLoader(file_path)
    # load() yields the documents as loaded; load_and_split() would also
    # run a text splitter, whose chunks may overlap and therefore repeat
    # text once everything is concatenated again.
    pages = loader.load()
    return "".join(f" {page.page_content}\n\n" for page in pages)
def generate_pdf_from_markup(markup_text):
    """Render Markdown text to a PDF file and return the file's path.

    Args:
        markup_text: Markdown source to render.

    Returns:
        Path of the generated PDF inside the system temp directory. The
        file is not removed here; the caller owns it.
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(markup_text)
    # Use a unique file per call: a fixed name in the shared temp directory
    # would let concurrent sessions overwrite each other's output.
    fd, pdf_path = tempfile.mkstemp(prefix="generated_", suffix=".pdf")
    os.close(fd)
    try:
        # Convert HTML to PDF
        HTML(string=html_content).write_pdf(pdf_path)
    except Exception:
        # Do not leave an empty placeholder behind when rendering fails.
        os.remove(pdf_path)
        raise
    return pdf_path
def parse_resume(data, model="llama3-70b-8192"):
    """Ask a Groq-hosted chat model to anonymize and reformat resume text.

    Args:
        data: Raw text extracted from the resume.
        model: Name of the Groq model to use.

    Returns:
        The response returned by ``LLMChain.invoke`` for the prompt built
        from the anonymization guidelines followed by ``data``.
    """
    # The API key is read from the GROQ_API_KEY environment variable.
    llm = ChatGroq(api_key=os.getenv("GROQ_API_KEY"), model=model)
    system_prompt = """
You are an AI assistant designed to remove and format resume data. When provided with extracted text from a PDF resume, your task is to remove personal information and certain details while maintaining the professional content and structure.
Follow the guidelines below:
Keep projects, experience, technical skills as it is without any change.
Remove Salutations: Mr, Mrs, Ms, etc.
Remove Names: All instances of the candidate's names.
Remove Gender: Any mention of gender.
Remove Age/D.O.B./Astrology Info: Any references to age, date of birth, or astrological signs.
Remove Links of personal accounts for example: email id, github url, linkedin url and all the other urls except the project and experience urls.
Remove email address, mobile number, or any other information that has personal identity.
Anonymize Location: Replace specific locations with more general terms (e.g., "Willing to relocate, currently based in Leicester").
Anonymize Education Institutions: Replace the names of educational institutions/schools with "top university (e.g. highly reputable university on the global stage) or top school" if applicable.
Anonymize Language Skills: Replace specific languages with regional groupings for multilingual candidates (e.g., "proficient in multiple European languages").
Remove Hobbies and INTERESTS: Remove specific details related to hobbies and interests
Anonymize Other Fields: Make specific removals as needed to protect the candidate's identity.
Remove professional summary, objective, agenda and all these type of sections.
Keep only related skills ACHIEVEMENTS, awards and certificate which are written by you.
Ensure the remaining sections and information are formatted properly to maintain the professional appearance of the resume.
Ensure proper formatting of the resume with proper content justifications, add markdown, add bullet points and spacing wherever required.
Return the output of resume content only. Don't include any notes or comments.
"""
    # NOTE: a further guideline ("Remove achievement, awards and
    # certificates that are not related to professional work.") is
    # intentionally left out of the prompt.
    user_prompt_template = """
{resume_text}
"""
    prompt_template = PromptTemplate(
        input_variables=["resume_text"],
        template=system_prompt + user_prompt_template,
    )
    anonymize_chain = LLMChain(
        llm=llm,
        prompt=prompt_template,
    )
    # Name the template variable explicitly instead of relying on the
    # chain to map a bare string onto its only input key.
    response = anonymize_chain.invoke({"resume_text": data})
    return response
def handle_pdf(file_path):
    """Anonymize the resume at *file_path* and offer the result as a PDF.

    The extracted text is sent through ``parse_resume``, the part after the
    first colon of the model's reply is shown on the page, and a PDF
    rendered from it is offered through a download button.

    Args:
        file_path: Path of the PDF resume to process.
    """
    with st.spinner("Parsing Resume..."):
        data = read_pdf(file_path)
        if not data.strip():
            # Nothing to anonymize (e.g. no extractable text); calling the
            # model with an empty resume would only produce made-up text.
            st.error("No text could be extracted from this PDF.")
            return
        modified_data = parse_resume(data)
        formatted_data = format_string(modified_data["text"])
    st.write(formatted_data)

    if not formatted_data:
        return

    with st.spinner("Generating PDF..."):
        # Generate the PDF from markup text
        pdf_path = generate_pdf_from_markup(formatted_data)
    st.success("PDF generated successfully.")

    # Load the rendered file so it can be served by the download button.
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
    st.download_button(
        label="Download PDF",
        data=pdf_bytes,
        file_name="generated.pdf",
        mime="application/pdf",
    )
def main():
    """Render the Resume Parser page and process the PDF the user picks.

    The user chooses between the bundled demo PDF (processed after a
    confirming button click) and uploading a PDF of their own (processed
    as soon as a file is present).
    """
    st.title("Resume Parser")
    choices = ("Use Demo PDF", "Browse Files")
    selected = st.radio("Choose an option:", choices)

    if selected == "Use Demo PDF":
        st.info("You have selected the demo PDF.")
        # The demo is only processed once the user confirms.
        if st.button("Click to go with Demo pdf"):
            handle_pdf("demo.pdf")
        return

    if selected != "Browse Files":
        return

    upload = st.file_uploader("Choose a PDF file", type="pdf")
    if upload is None:
        return

    saved_path = save_uploaded_file(upload)
    st.success(f"File saved at {saved_path}")
    handle_pdf(saved_path)
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()