Spaces:

WordLift
/

entity-linking

Running

App Files Files

cyberandy committed on Jun 19

Commit

e8957f4

verified ·

1 Parent(s): 87c144f

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -123

app.py CHANGED Viewed

@@ -20,15 +20,13 @@ st.set_page_config(
 # Sidebar
 st.sidebar.image("logo-wordlift.png")
 language_options = {"English", "English - spaCy", "German"}
-# Set default to English to avoid an error on the first run
 selected_language = st.sidebar.selectbox("Select the Language", list(language_options), index=0)
-# Initialize model and entity set variables
-selected_model_name = None
-selected_entity_set = None
 # Based on selected language, configure model, entity set, and citation options
 if selected_language == "German" or selected_language == "English - spaCy":
     entity_fishing_citation = """
     @misc{entity-fishing,
     title = {entity-fishing},
@@ -38,12 +36,14 @@ if selected_language == "German" or selected_language == "English - spaCy":
     eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(entity_fishing_citation)
-else: # English (Refined)
     model_options = ["aida_model", "wikipedia_model_with_numbers"]
     entity_set_options = ["wikidata", "wikipedia"]
     selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
     selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
@@ -55,152 +55,196 @@ else: # English (Refined)
     year = "2022"
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(refined_citation)
-@st.cache_resource
 def load_model(selected_language, model_name=None, entity_set=None):
-    # Define the public URL for the entity-fishing service
-    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
-    if selected_language == "German":
-        # Load the German-specific model
-        nlp_model_de = spacy.load("de_core_news_lg")
-        # Add the entity-fishing pipe with the server URL configured
-        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
-        return nlp_model_de
-    elif selected_language == "English - spaCy":
-        # Load English-specific model
-        nlp_model_en = spacy.load("en_core_web_sm")
-        # Add the entity-fishing pipe with the server URL configured
-        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
-        return nlp_model_en
-    else: # English (Refined)
-        # Load the pretrained model for other languages
-        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
-        return refined_model
 # Use the cached model
-# We pass the selected options directly to the cached function
-# Streamlit's caching handles re-running this only when the inputs change
 model = load_model(selected_language, selected_model_name, selected_entity_set)
 # Helper functions
-def get_wikidata_id(entity_id_string):
-    # Handles IDs like "wikidata:Q123" or "wikidata=Q123"
-    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
     entity_link = "http://www.wikidata.org/entity/" + entity_id
     return {"id": entity_id, "link": entity_link}
 def get_entity_data(entity_link):
     try:
         # Format the entity_link
         formatted_link = entity_link.replace("http://", "http/")
         response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
-        response.raise_for_status() # Raise an exception for bad status codes
         return response.json()
-    except requests.exceptions.RequestException as e:
-        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
         return None
 # Create the form
 with st.form(key='my_form'):
-    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
     submit_button = st.form_submit_button(label='Analyze')
 # Initialization
 entities_map = {}
 entities_data = {}
-if text_input:
-    if selected_language in ["German", "English - spaCy"]:
-        doc = model(text_input)
-        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
-        for entity in spacy_entities:
-            entity_string, entity_type, wikidata_id, wikidata_url = entity
-            if wikidata_url:
-                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
-                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
-                entity_data = get_entity_data(formatted_wikidata_url)
-                if entity_data is not None:
-                    entities_data[entity_string] = entity_data
-    else: # Refined model
-        refined_entities = model.process_text(text_input)
-        for entity in refined_entities:
-            # More robustly access entity attributes instead of parsing a string
-            if entity.entity_id and "wikidata" in entity.entity_id:
-                entity_text = entity.text
-                wikidata_info = get_wikidata_id(entity.entity_id)
-                entities_map[entity_text] = wikidata_info
-                entity_data = get_entity_data(wikidata_info["link"])
-                if entity_data is not None:
-                    entities_data[entity_text] = entity_data
-    combined_entity_info_dictionary = {
-        k: [entities_map[k], entities_data.get(k)] for k in entities_map
-    }
-    if submit_button:
-        # A more robust way to build the annotated_text list without using eval()
-        final_text = []
-        current_pos = 0
-        # Create a simple list of (text, start, end) for sorting
-        entity_spans = []
         if selected_language in ["German", "English - spaCy"]:
-            # 'doc' is available from the processing block above
             for ent in doc.ents:
-                if ent.text in entities_map: # only include linked entities
-                    entity_spans.append((ent.text, ent.start_char, ent.end_char))
         else:
-            # 'refined_entities' is available
-             for ent in refined_entities:
-                if ent.text in entities_map:
-                    entity_spans.append((ent.text, ent.span[0], ent.span[1]))
-        # Sort entities by their starting position to handle the text correctly
-        sorted_entities = sorted(entity_spans, key=lambda x: x[1])
-        for entity_string, start, end in sorted_entities:
-            # Add the text segment before the current entity
-            final_text.append(text_input[current_pos:start])
-            # Prepare the annotation for the entity
-            entity_info = entities_map.get(entity_string, {})
-            entity_id = entity_info.get("id", "N/A")
-            entity_type_data = entities_data.get(entity_string)
-            entity_type = entity_type_data.get("@type") if entity_type_data else None
-            color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
-                     "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
-            final_text.append((entity_string, entity_id, color))
-            current_pos = end
-        # Add any remaining text after the last entity
-        final_text.append(text_input[current_pos:])
-        st.header("Annotated Text")
-        annotated_text(*[item for item in final_text if item]) # Filter out empty strings
-        # --- JSON-LD Generation ---
-        json_ld_data = {
-                "@context": "https://schema.org",
-                "@type": "WebPage",
-                "mentions": []
-            }
-        for entity_string, info_list in combined_entity_info_dictionary.items():
-            entity_json_ld = info_list[1] # The data from WordLift API
-            if entity_json_ld:
-                 json_ld_data["mentions"].append(entity_json_ld)
-        with st.expander("See annotations"):
-            st.write(combined_entity_info_dictionary)
-        with st.expander("Here is the final JSON-LD"):
-            st.json(json_ld_data)

 # Sidebar
 st.sidebar.image("logo-wordlift.png")
 language_options = {"English", "English - spaCy", "German"}
 selected_language = st.sidebar.selectbox("Select the Language", list(language_options), index=0)
 # Based on selected language, configure model, entity set, and citation options
 if selected_language == "German" or selected_language == "English - spaCy":
+    selected_model_name = None
+    selected_entity_set = None
     entity_fishing_citation = """
     @misc{entity-fishing,
     title = {entity-fishing},
     eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(entity_fishing_citation)
+else:
     model_options = ["aida_model", "wikipedia_model_with_numbers"]
     entity_set_options = ["wikidata", "wikipedia"]
     selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
     selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
     year = "2022"
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(refined_citation)
+@st.cache_resource  # 👈 Add the caching decorator
 def load_model(selected_language, model_name=None, entity_set=None):
+    try:
+        if selected_language == "German":
+            # Load the German-specific model
+            nlp_model_de = spacy.load("de_core_news_lg")
+            # Check if entityfishing component is available
+            if "entityfishing" not in nlp_model_de.pipe_names:
+                try:
+                    nlp_model_de.add_pipe("entityfishing")
+                except Exception as e:
+                    st.error(f"Error adding entityfishing component: {e}")
+                    st.error("Please make sure entity-fishing is properly installed and configured.")
+                    return None
+            return nlp_model_de
+        elif selected_language == "English - spaCy":
+            # Load English-specific model
+            nlp_model_en = spacy.load("en_core_web_sm")
+            # Check if entityfishing component is available
+            if "entityfishing" not in nlp_model_en.pipe_names:
+                try:
+                    nlp_model_en.add_pipe("entityfishing")
+                except Exception as e:
+                    st.error(f"Error adding entityfishing component: {e}")
+                    st.error("Please make sure entity-fishing is properly installed and configured.")
+                    return None
+            return nlp_model_en
+        else:
+            # Load the pretrained model for other languages
+            refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
+            return refined_model
+    except Exception as e:
+        st.error(f"Error loading model: {e}")
+        return None
 # Use the cached model
 model = load_model(selected_language, selected_model_name, selected_entity_set)
 # Helper functions
+def get_wikidata_id(entity_string):
+    entity_list = entity_string.split("=")
+    entity_id = str(entity_list[1])
     entity_link = "http://www.wikidata.org/entity/" + entity_id
     return {"id": entity_id, "link": entity_link}
 def get_entity_data(entity_link):
     try:
         # Format the entity_link
         formatted_link = entity_link.replace("http://", "http/")
         response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
         return response.json()
+    except Exception as e:
+        print(f"Exception when fetching data for entity: {entity_link}. Exception: {e}")
         return None
 # Create the form
 with st.form(key='my_form'):
+    text_input = st.text_area(label='Enter a sentence')
     submit_button = st.form_submit_button(label='Analyze')
 # Initialization
 entities_map = {}
 entities_data = {}
+if text_input and model is not None:
+    try:
         if selected_language in ["German", "English - spaCy"]:
+            # Process the text with error handling
+            doc = model(text_input)
+            # Fixed the syntax error: ent._.kb_qid instead of ent..kb_qid
+            entities = []
             for ent in doc.ents:
+                try:
+                    # Check if the custom attributes exist
+                    kb_qid = getattr(ent._, 'kb_qid', None) if hasattr(ent, '_') else None
+                    url_wikidata = getattr(ent._, 'url_wikidata', None) if hasattr(ent, '_') else None
+                    entities.append((ent.text, ent.label_, kb_qid, url_wikidata))
+                except AttributeError as e:
+                    # If the entityfishing attributes don't exist, use basic entity info
+                    entities.append((ent.text, ent.label_, None, None))
+            for entity in entities:
+                entity_string, entity_type, wikidata_id, wikidata_url = entity
+                if wikidata_url:
+                    # Ensure correct format for the German and English model
+                    formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
+                    entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
+                    entity_data = get_entity_data(formatted_wikidata_url)
+                    if entity_data is not None:
+                        entities_data[entity_string] = entity_data
         else:
+            entities = model.process_text(text_input)
+            for entity in entities:
+                single_entity_list = str(entity).strip('][').replace("\'", "").split(', ')
+                if len(single_entity_list) >= 2 and "wikidata" in single_entity_list[1]:
+                    entities_map[single_entity_list[0].strip()] = get_wikidata_id(single_entity_list[1])
+                    entity_data = get_entity_data(entities_map[single_entity_list[0].strip()]["link"])
+                    if entity_data is not None:
+                        entities_data[single_entity_list[0].strip()] = entity_data
+    except Exception as e:
+        st.error(f"Error processing text: {e}")
+        if "entityfishing" in str(e).lower():
+            st.error("This appears to be an entity-fishing related error. Please ensure:")
+            st.error("1. Entity-fishing service is running")
+            st.error("2. spacyfishing package is properly installed")
+            st.error("3. Network connectivity to entity-fishing service")
+# Combine entity information
+combined_entity_info_dictionary = dict([(k, [entities_map[k], entities_data[k] if k in entities_data else None]) for k in entities_map])
+if submit_button and entities_map:
+    # Prepare a list to hold the final output
+    final_text = []
+    # JSON-LD data
+    json_ld_data = {
+            "@context": "https://schema.org",
+            "@type": "WebPage",
+            "mentions": []
+        }
+   # Replace each entity in the text with its annotated version
+    for entity_string, entity_info in entities_map.items():
+        # Check if the entity has a valid Wikidata link
+        if entity_info["link"] is None or entity_info["link"] == "None":
+            continue  # skip this entity
+        entity_data = entities_data.get(entity_string, None)
+        entity_type = None
+        if entity_data is not None:
+            entity_type = entity_data.get("@type", None)
+        # Use different colors based on the entity's type
+        color = "#8ef"  # Default color
+        if entity_type == "Place":
+            color = "#8AC7DB"
+        elif entity_type == "Organization":
+            color = "#ADD8E6"
+        elif entity_type == "Person":
+            color = "#67B7D1"
+        elif entity_type == "Product":
+            color = "#2ea3f2"
+        elif entity_type == "CreativeWork":
+            color = "#00BFFF"
+        elif entity_type == "Event":
+            color = "#1E90FF"
+        entity_annotation = (entity_string, entity_info["id"], color)
+        text_input = text_input.replace(entity_string, f'{{{str(entity_annotation)}}}', 1)
+        # Add the entity to JSON-LD data
+        entity_json_ld = combined_entity_info_dictionary[entity_string][1]
+        if entity_json_ld and entity_json_ld.get("link") != "None":
+            json_ld_data["mentions"].append(entity_json_ld)
+    # Split the modified text_input into a list
+    text_list = text_input.split("{")
+    for item in text_list:
+        if "}" in item:
+            item_list = item.split("}")
+            try:
+                final_text.append(eval(item_list[0]))
+            except:
+                final_text.append(item_list[0])
+            if len(item_list) > 1 and len(item_list[1]) > 0:
+                final_text.append(item_list[1])
+        else:
+            final_text.append(item)
+    # Pass the final_text to the annotated_text function
+    annotated_text(*final_text)
+    with st.expander("See annotations"):
+        st.write(combined_entity_info_dictionary)
+    with st.expander("Here is the final JSON-LD"):
+        st.json(json_ld_data)  # Output JSON-LD
+elif submit_button and not entities_map:
+    st.warning("No entities found in the text. Please try with different text or check if the model is working correctly.")