Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
"""
# https://huggingface.co/spaces/user2434/SummarizedAbstract
# Import necessary libraries
import functools
from io import BytesIO

import gradio as gr
import PyPDF2
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def extract_abstract(pdf_path):
    """Return the text of the page(s) that hold a PDF's abstract.

    The span starts at the first page whose extracted text contains
    "Abstract" and runs up to, but not including, the first *later* page
    whose text contains one of the section markers "Introduction",
    "Background", "1." or "I.".

    If no later page carries a marker (for example a single-page PDF, or an
    abstract that sits on the last page), only the page on which "Abstract"
    was found is returned instead of discarding the match.

    Args:
        pdf_path: Path of the PDF file to read; it is passed to ``open``.

    Returns:
        The concatenated text of the selected pages, or ``None`` when no
        page contains the word "Abstract".
    """
    section_markers = ("Introduction", "Background", "1.", "I.")

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Text of every page visited so far, so that no page has to be
        # extracted a second time when the result is assembled.
        page_texts = []
        abstract_start = None
        abstract_end = None

        for page_num, page in enumerate(reader.pages):
            # Defensive: treat a missing extraction result as empty text.
            page_text = page.extract_text() or ""
            page_texts.append(page_text)

            if abstract_start is None:
                if "Abstract" in page_text:
                    abstract_start = page_num
                # The end-marker search only begins on the page *after* the
                # one where the abstract starts.
                continue

            if any(marker in page_text for marker in section_markers):
                abstract_end = page_num
                break

    if abstract_start is None:
        return None

    if abstract_end is None:
        # No following section marker: fall back to the abstract page alone.
        abstract_end = abstract_start + 1

    return ''.join(page_texts[abstract_start:abstract_end])
@functools.lru_cache(maxsize=1)
def _load_summarizer():
    """Load the summarization tokenizer and model once and reuse them."""
    model_name = "pszemraj/led-base-book-summary"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def summarize_abstract(text):
    """Summarize *text* into a single sentence with a pre-trained model.

    The tokenizer and model ("pszemraj/led-base-book-summary") are loaded on
    the first call and cached, so later calls do not pay the loading cost
    again.

    Args:
        text: The text to summarize. It is tokenized with truncation at
            ``max_length=1000``.

    Returns:
        The decoded summary. If the generated text contains a period, only
        the part up to and including the first period is returned;
        otherwise the whole generated text is returned unchanged.
    """
    tokenizer, model = _load_summarizer()

    inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)

    # Sampling is enabled (do_sample=True), so the output may differ between
    # calls for the same input.
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=40,
        min_length=20,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        num_beams=3,
        do_sample=True,
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Keep only the first sentence, so a generation that was cut off
    # mid-sentence does not leave a dangling fragment after it.
    first_sentence, period, _ = summary.partition('.')
    if period:
        summary = first_sentence + '.'
    return summary
def convert_to_speech(text):
    """Turn *text* into English speech and return the audio as bytes.

    Args:
        text: The text to be spoken.

    Returns:
        The raw bytes that gTTS wrote for the synthesized speech.
    """
    speech = gTTS(text, lang='en')
    # Render into memory rather than a file on disk.
    with BytesIO() as audio_stream:
        speech.write_to_fp(audio_stream)
        return audio_stream.getvalue()
def process_pdf(pdf_path):
    """Summarize the abstract of a PDF and produce a spoken version of it.

    A ``(text, audio)`` pair is always returned, so a two-output interface
    receives both values even when processing cannot be completed. In that
    case the text explains what went wrong and the audio is ``None``.

    Args:
        pdf_path: Path of the uploaded PDF, as passed on to
            ``extract_abstract``. A missing or empty value is reported
            instead of being opened.

    Returns:
        A tuple ``(summary, audio)``: the written summary together with the
        value returned by ``convert_to_speech`` for it, or
        ``(message, None)`` when no file was given, no abstract was found,
        or no summary was produced.
    """
    if not pdf_path:
        return "No PDF was provided. Please upload a PDF file.", None

    abstract_text = extract_abstract(pdf_path)
    if not abstract_text:
        return "No abstract could be found in the uploaded PDF.", None

    # Only the first 1024 characters of the extracted text are summarized.
    summary = summarize_abstract(abstract_text[:1024])
    if not summary:
        return "A summary could not be generated for this abstract.", None

    return summary, convert_to_speech(summary)
# Gradio components: one PDF upload as input, a written summary and its
# spoken version as outputs.
inputs = gr.File(label="Upload a PDF with an abstract")  # labelled file input
summary_text = gr.Text(label="Written summary of the abstract")
audio_summary = gr.Audio(label="Audio summary of abstract")

# Example input offered in the UI. The path is relative, so the file is
# presumably expected to sit alongside this script.
example_pdfs = ["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"]

# Wire process_pdf to the components above.
iface = gr.Interface(
    fn=process_pdf,
    inputs=inputs,
    outputs=[summary_text, audio_summary],
    examples=example_pdfs,
    title="Summarized Abstract",
    description="The app will summarize the abstract of a PDF and read it to the user.",
)

# Start serving the interface.
iface.launch()