Spaces:

bacancydataprophets
/

resume_parser

Sleeping

App Files Files Community

resume_parser / app.py

gneya-bacancy

Update app.py

ed66a21 verified over 1 year ago

raw

history blame contribute delete

6.27 kB

	import streamlit as st
	import os
	import tempfile
	# from pypdf import PdfReader, PdfWriter
	# from pdf2image import convert_from_path
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.prompts import PromptTemplate
	from langchain_groq import ChatGroq
	from langchain.chains.llm import LLMChain
	import tempfile
	import markdown2
	from weasyprint import HTML
	from io import BytesIO

	def format_string(input_string):
	    """Return the part of ``input_string`` that follows its first colon.

	    The text is split at the first ":" found anywhere in the string and
	    everything after that colon is returned untouched. A string that
	    contains no colon is returned unchanged.
	    """
	    _, colon, remainder = input_string.partition(":")
	    # partition() reports an empty separator when no ":" exists; in that
	    # case the original text is handed back as-is.
	    return remainder if colon else input_string
	def save_uploaded_file(uploadedfile):
	    """Write an uploaded file into the system temp directory and return its path.

	    Args:
	        uploadedfile: Object exposing a ``name`` attribute (the client-supplied
	            file name) and a ``getbuffer()`` method returning the file bytes.

	    Returns:
	        The path of the saved copy, located directly inside
	        ``tempfile.gettempdir()``.
	    """
	    temp_dir = tempfile.gettempdir()

	    # The name comes from the client, so keep only its final component:
	    # "../x.pdf" or an absolute path would otherwise make os.path.join()
	    # point outside the temp directory. Backslashes are normalised first so
	    # Windows-style names are reduced the same way on POSIX hosts.
	    safe_name = os.path.basename(uploadedfile.name.replace("\\", "/"))
	    if safe_name in ("", ".", ".."):
	        safe_name = "uploaded.pdf"

	    save_path = os.path.join(temp_dir, safe_name)
	    with open(save_path, "wb") as f:
	        f.write(uploadedfile.getbuffer())

	    return save_path

	def read_pdf(file_path):
	    """Extract the text of the PDF at ``file_path`` as a single string.

	    The file is loaded through ``PyPDFLoader.load_and_split()``; the
	    ``page_content`` of each returned document is prefixed with one space,
	    followed by a blank line, and the pieces are concatenated in order.
	    """
	    documents = PyPDFLoader(file_path).load_and_split()
	    pieces = [" " + doc.page_content + "\n\n" for doc in documents]
	    return "".join(pieces)
	def generate_pdf_from_markup(markup_text):
	    """Render Markdown text to a PDF file and return the file's path.

	    Args:
	        markup_text: Markdown source to render.

	    Returns:
	        Path of a newly created PDF inside the system temp directory. Each
	        call produces its own file; the caller may delete it once read.
	    """
	    # Convert Markdown to HTML
	    html_content = markdown2.markdown(markup_text)

	    # A fixed "generated.pdf" name would be shared by every session of the
	    # app, so two concurrent requests could overwrite (and then serve) each
	    # other's document. mkstemp() reserves a unique name per call instead.
	    fd, pdf_path = tempfile.mkstemp(prefix="generated_", suffix=".pdf")
	    os.close(fd)  # only the reserved path is needed; the PDF is written by path

	    try:
	        # Convert HTML to PDF
	        HTML(string=html_content).write_pdf(pdf_path)
	    except Exception:
	        # Do not leave an empty placeholder behind when rendering fails;
	        # cleanup is best-effort and the rendering error is re-raised.
	        try:
	            os.remove(pdf_path)
	        except OSError:
	            pass
	        raise

	    return pdf_path

	def parse_resume(data):
	    """Send resume text to the Groq chat model with the anonymization prompt.

	    Args:
	        data: Resume text substituted for the ``{resume_text}`` placeholder.

	    Returns:
	        The value returned by ``LLMChain.invoke``; the caller in this file
	        reads the generated resume from its ``"text"`` entry.
	    """
	    groq_model = ChatGroq(api_key=os.getenv("GROQ_API_KEY"),model="llama3-70b-8192")

	    # Instruction block that precedes the resume text in the final prompt.
	    instructions = """
	You are an AI assistant designed to remove and format resume data. When provided with extracted text from a PDF resume, your task is to remove personal information and certain details while maintaining the professional content and structure.
	Follow the guidelines below:
	Keep projects, experience, technical skills as it is without any change.
	Remove Salutations: Mr, Mrs, Ms, etc.
	Remove Names: All instances of the candidate's names.
	Remove Gender: Any mention of gender.
	Remove Age/D.O.B./Astrology Info: Any references to age, date of birth, or astrological signs.
	Remove Links of personal accounts for example: exail id, github url, linkedin url and all the other urls except the project and experience urls.
	Remove email address, mobile number, or any other information that has personal identity.
	Anonymize Location: Replace specific locations with more general terms (e.g., "Willing to relocate, currently based in Leicester").
	Anonymize Education Institutions: Replace the names of educational institutions/schools with "top university (e.g. highly reputable university on the global stage) or top school" if applicable.
	Anonymize Language Skills: Replace specific languages with regional groupings for multilingual candidates (e.g., "proficient in multiple European languages").
	Remove Hobbies and INTERESTS: Remove specific details related to hobbies and interests
	Anonymize Other Fields: Make specific removals as needed to protect the candidate's identity.
	Remove professional summary, objective, agenda and all these type of sections.
	Keep only related skills ACHIEVEMENTS, awards and certificate which are writen by you.
	Ensure the remaining sections and information are formatted properly to maintain the professional appearance of the resume.
	Ensure proper formatting of the resume with proper content justifications, add markdown, add bullet points and spacing wherever required.
	Return the output of resume content only. Don't include any notes or comments.
	"""
	    # Disabled rule (not sent to the model): remove achievements, awards and
	    # certificates that are not related to professional work.

	    # Slot where the resume text is inserted, appended after the instructions.
	    resume_slot = """
	{resume_text}
	"""

	    full_prompt = PromptTemplate(
	        input_variables=["resume_text"],
	        template=instructions + resume_slot
	    )
	    return LLMChain(llm=groq_model, prompt=full_prompt).invoke(data)

	def handle_pdf(file_path):
	    """Anonymize the resume at ``file_path`` and offer the result as a PDF.

	    Reads the PDF text, sends it through the LLM, shows the formatted result
	    on the page, renders it to a PDF and adds a download button for that PDF.
	    When the model returns no usable text, a warning is shown instead.

	    Args:
	        file_path: Path of the resume PDF to process.
	    """
	    with st.spinner("Parsing Resume..."):
	        data = read_pdf(file_path)
	        modified_data = parse_resume(data)
	        formatted_data = format_string(modified_data["text"])

	    if not formatted_data.strip():
	        # Nothing usable came back; say so rather than silently rendering
	        # an empty page with no download option.
	        st.warning("No resume content was generated.")
	        return

	    st.write(formatted_data)

	    with st.spinner("Generating PDF..."):
	        pdf_path = generate_pdf_from_markup(formatted_data)
	    st.success("PDF generated successfully.")

	    # Read the PDF into memory so the download button serves its bytes.
	    with open(pdf_path, "rb") as f:
	        pdf_bytes = f.read()
	    st.download_button(
	        label="Download PDF",
	        data=pdf_bytes,
	        file_name="generated.pdf",
	        mime="application/pdf"
	    )
	def main():
	    """Build the Streamlit page and process the selected resume.

	    The user picks between the bundled demo PDF (processed after a button
	    click) and uploading a PDF (saved to disk and processed once a file
	    is present).
	    """
	    st.title("Resume Parser")
	    choice = st.radio(
	        "Choose an option:",
	        ("Use Demo PDF", "Browse Files"),
	    )

	    if choice == "Use Demo PDF":
	        st.info("You have selected the demo PDF.")
	        if st.button("Click to go with Demo pdf"):
	            handle_pdf("demo.pdf")
	        return

	    if choice != "Browse Files":
	        return

	    upload = st.file_uploader("Choose a PDF file", type="pdf")
	    if upload is None:
	        # Nothing selected yet; wait for the next rerun.
	        return

	    saved_path = save_uploaded_file(upload)
	    st.success(f"File saved at {saved_path}")
	    handle_pdf(saved_path)

	# Start the app when this module is the program entry point (not on import).
	if __name__ == "__main__":
	    main()