Spaces:
Runtime error
Runtime error
| """TED Talks Summarizer App.""" | |
| from re import sub | |
| from gradio import Interface, Textbox | |
| from requests import get | |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline | |
# Hugging Face Hub id of the checkpoint used for summarization.
repo_id = "pszemraj/led-base-book-summary"

# Load the seq2seq weights and the matching tokenizer from the same repo.
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Module-level pipeline reused by text_summarizer() for every request.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
def clean_text(text: str) -> str:
    """Cleans subtitle text of ted talks.

    Removes parenthesised annotations such as "(Applause)", drops the first
    line of the file and every line containing a "-->" timing marker, then
    joins the remaining lines into one whitespace-normalised string.

    Args:
        text (str): subtitle of ted talk

    Returns:
        cleaned_text (str): cleaned version of subtitle text
    """
    # Non-greedy on purpose: a greedy ".*" spans from the first "(" to the
    # last ")" on a line and would also delete the speech between two
    # annotations, e.g. "(Applause) Hello (Laughter)".
    text = sub(r"\(.*?\)", "", text)
    # Skip the first line (presumably the "WEBVTT" header of the .vtt file).
    lines = text.split("\n")[1:]
    # Timing lines are recognised by the "-->" marker; keep everything else.
    captions = [line for line in lines if "-->" not in line]
    # split()/join drops blank lines and collapses the doubled spaces that
    # removed annotations leave behind.
    cleaned_text = " ".join(" ".join(captions).split())
    return cleaned_text
def ted_talk_transcriber(link: str) -> str:
    """Creates transcription of ted talks from url.

    Downloads the talk page, extracts the talk id that follows the
    "project_masters/" marker in the page source, and fetches the English
    subtitle file for that id.

    Args:
        link (str): url link of ted talks

    Returns:
        raw_text (str): raw transcription of the ted talk

    Raises:
        ValueError: if the page source contains no "project_masters/" marker.
        requests.HTTPError: if either request returns an error status.
    """
    # Without a timeout a stalled connection would block the app indefinitely.
    timeout = 30  # seconds

    # request link of the talk
    page = get(link, timeout=timeout)
    page.raise_for_status()

    # extract unique talk id to reach subtitle file
    marker = "project_masters/"
    _, found, tail = page.text.partition(marker)
    if not found:
        raise ValueError(f"Could not find a talk id in the page at {link!r}")
    talk_id = tail.split("/")[0]

    subtitles = get(
        f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt",
        timeout=timeout,
    )
    # Fail loudly instead of handing an HTML error page to the summarizer.
    subtitles.raise_for_status()
    # WebVTT files are UTF-8; requests would otherwise fall back to
    # ISO-8859-1 for text/* responses that declare no charset.
    subtitles.encoding = "utf-8"
    raw_text = subtitles.text
    return raw_text
def text_summarizer(text: str) -> str:
    """Produce a summary of the given transcript.

    Args:
        text (str): ted talks transcription

    Returns:
        str: summary
    """
    # Generation settings passed to the module-level `summarizer` pipeline:
    # beam search with 4 beams, no sampling, plus repetition controls.
    generation_options = {
        "min_length": 8,
        "max_length": 256,
        "no_repeat_ngram_size": 3,
        "encoder_no_repeat_ngram_size": 3,
        "repetition_penalty": 3.5,
        "num_beams": 4,
        "do_sample": False,
        "early_stopping": True,
    }
    summaries = summarizer(text, **generation_options)
    # Only the first result is used.
    first_summary = summaries[0]
    return first_summary["summary_text"]
def main(link: str) -> str:
    """Run the full pipeline for one talk: fetch, clean, summarize.

    Args:
        link (str): url link of ted talks

    Returns:
        str: summary
    """
    # transcript download -> subtitle cleanup -> summarization
    return text_summarizer(clean_text(ted_talk_transcriber(link)))
# HTML snippet shown as the interface description (centered TED logo).
logo = "<center><img src='file/TED.png' width=180px></center>"

# Input and output widgets of the web UI.
link_box = Textbox(label="Type the TED Talks link")
summary_box = Textbox(label="Summary")

# Build the interface around main() and start serving it.
demo = Interface(
    main,
    inputs=link_box,
    examples=[
        "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body"
    ],
    outputs=summary_box,
    allow_flagging="never",
    description=logo,
)
demo.launch()