Spaces:

JVice
/

try-before-you-bias

Sleeping

App Files Community

JVice committed on Dec 12, 2023

Commit

df54106

1 Parent(s): eca2c3f

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -98

app.py CHANGED Viewed

@@ -1,52 +1,28 @@
 import streamlit as st
 st.set_page_config(layout="wide")
 import streamlit_authenticator as stauth
 import pandas as pd
 import numpy as np
-import uuid
 import model_comparison as MCOMP
 import model_loading as MLOAD
 import model_inferencing as MINFER
 import user_evaluation_variables
-from pathlib import Path
 import tab_manager
 import yaml
-import os
 from yaml.loader import SafeLoader
 from PIL import Image
-import huggingface_hub
-from huggingface_hub import Repository
 AUTHENTICATOR = None
 TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')
 USER_LOGGED_IN = False
-DATASET_REPO_URL = "https://huggingface.co/datasets/JVice/try-before-you-bias-data"
-DATA_FILENAME = "user_database.yaml"
-USER_DATA_FILE = os.path.join("data", DATA_FILENAME)
-HF_TOKEN = os.environ.get("HF_TOKEN")
-repo = Repository(
-    local_dir="tbyb_data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
-)
-print("is none?", HF_TOKEN is None)
-print("hfh", huggingface_hub.__version__)
 def create_new_user(authenticator, users):
     try:
         if authenticator.register_user('Register user', preauthorization=False):
             st.success('User registered successfully')
     except Exception as e:
         st.error(e)
-    with open(USER_DATA_FILE, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
-        commit_url = repo.push_to_hub()
-        st.write(commit_url)
 def forgot_password(authenticator, users):
     try:
         username_of_forgotten_password, email_of_forgotten_password, new_random_password = authenticator.forgot_password(
@@ -56,12 +32,8 @@ def forgot_password(authenticator, users):
             # Random password should be transferred to user securely
     except Exception as e:
         st.error(e)
-    with open(USER_DATA_FILE, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
-        commit_url = repo.push_to_hub()
-        st.write(commit_url)
 def update_account_details(authenticator, users):
     if st.session_state["authentication_status"]:
         try:
@@ -69,12 +41,8 @@ def update_account_details(authenticator, users):
                 st.success('Entries updated successfully')
         except Exception as e:
             st.error(e)
-    with open(USER_DATA_FILE, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
-        commit_url = repo.push_to_hub()
-        st.write(commit_url)
 def reset_password(authenticator, users):
     if st.session_state["authentication_status"]:
         try:
@@ -82,18 +50,14 @@ def reset_password(authenticator, users):
                 st.success('Password modified successfully')
         except Exception as e:
             st.error(e)
-    with open(USER_DATA_FILE, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
-        commit_url = repo.push_to_hub()
-        st.write(commit_url)
 def user_login_create():
     global AUTHENTICATOR
     global TBYB_LOGO
     global USER_LOGGED_IN
     users = None
-    with open(USER_DATA_FILE) as file:
         users = yaml.load(file, Loader=SafeLoader)
         AUTHENTICATOR = stauth.Authenticate(
             users['credentials'],
@@ -130,28 +94,24 @@ def user_login_create():
                 # update_account_details(AUTHENTICATOR, users)
                 reset_password(AUTHENTICATOR, users)
-    return USER_LOGGED_IN
 def setup_page_banner():
     global USER_LOGGED_IN
     # for tab in [tab1, tab2, tab3, tab4, tab5]:
-    c1, c2, c3, c4, c5, c6, c7, c8, c9 = st.columns(9)
     with c5:
         st.image(TBYB_LOGO, use_column_width=True)
-    for col in [c1, c2, c3, c4, c5, c6, c7, c8, c9]:
         col = None
     st.title('Try Before You Bias (TBYB)')
     st.write('*A Quantitative T2I Bias Evaluation Tool*')
 def setup_how_to():
     expander = st.expander("How to Use")
     expander.write("1. Login to your TBYB Account using the bar on the right\n"
                    "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n")
     expander.image(Image.open('./assets/HF_MODEL_ID_EXAMPLE.png'))
-    expander.write(
-        "3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n")
     expander.image(Image.open('./assets/lykon_corgi.png'))
     expander.write("4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs "
                    "   to evaluate your model once it has been loaded\n"
@@ -161,45 +121,44 @@ def setup_how_to():
                    "   '\U0001F4F0 Additional Information' tab for a TL;DR.\n"
                    "8. For any questions or to report any bugs/issues. Please contact jordan.vice@uwa.edu.au.\n")
 def setup_additional_information_tab(tab):
     with tab:
         st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
         st.markdown(
             """
             *Based on the article of the same name available here --PAPER HYPERLINK--
             Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian
             This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
             implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
             all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
             that others share their evaluations as we look to further the discussion on transparency and reliability
             of T2I models.
             """)
         st.header('2. A (very) Brief Summary')
         st.image(Image.open('./assets/TBYB_flowchart.png'))
         st.markdown(
-            """
-            Bias in text-to-image models can propagate unfair social representations and could be exploited to
-            aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
-            methods focused on social biases. So, we proposed a bias evaluation methodology that considered
-            general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
-            """
-        )
         st.markdown(
             """
-                We proposed three novel metrics to quantify T2I model biases:
-                1. Distribution Bias - $B_D$
-                2. Jaccard Hallucination - $H_J$
-                3. Generative Miss Rate - $M_G$
-                Open the appropriate drop-down menu to understand the logic and inspiration behind metric.
-                """
         )
-        c1, c2, c3 = st.columns(3)
         with c1:
             with st.expander("Distribution Bias - $B_D$"):
                 st.markdown(
@@ -207,16 +166,16 @@ def setup_additional_information_tab(tab):
                     Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
                     in the context of T2I models, using AuC allows us to define the distribution of objects that have been
                     detected in generated output image scenes.
                     So, everytime an object is detected in a scene, we update a dictionary (which is available for
                     download after running an evaluation). After evaluating a full set of images, you can use this
                     information to determine what objects appear more frequently than others.
                     After all images are evaluated, we sort the objects in descending order and normalize the data. We
                     then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:
                     $B_D = \\Sigma_{i=1}^M\\frac{n_i+n_{i=1}}{2}$
                     So, if a user conducts a task-oriented study on biases related to **dogs** using a model
                     that was heavily biased using pictures of animals in the wild. You might find that after running
                     evaluations, the most common objects detected were trees and grass - even if these objects weren't
@@ -233,21 +192,21 @@ def setup_additional_information_tab(tab):
                     in relation to some of the most popular large language models. Depending on where you look, hallucinations
                     can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
                     that we echo in our bias evaluations.
                     Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
                     T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
                     specified. This indicates that there could be an innate shift in bias in the model, causing it to
                     add or omit certain objects.
                     Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
                     hallucination. Then, we considered the Jaccard similarity coefficient, which
                     measures the similarity *and* diversity of two sets of objects/samples - defining this as
                     Jaccard Hallucination - $H_J$.
                     Simply put, we define the set of objects detected in the input prompt and then detect the objects in
                     the corresponding output image. Then, we determine the intersect over union. For a model, we
                     calculate the average $H_J$ across generated images using:
                     $H_J = \\frac{\Sigma_{i=0}^{N-1}1-\\frac{\mathcal{X}_i\cap\mathcal{Y}_i}{\mathcal{X}_i\cup\mathcal{Y}_i}}{N}$
                     """
@@ -261,16 +220,16 @@ def setup_additional_information_tab(tab):
                     of evaluating bias, we thought that it would be important to see if there was a correlation
                     between bias and performance (as we predicted). And while the other metrics do evaluate biases
                     in terms of misalignment, they do not consider the relationship between bias and performance.
                     We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
                     as a model becomes more biased, it will begin to diverge away from the intended target and so, the
                     miss rate of the generative model will increase as a result. This was a major consideration when
                     designing this metric.
                     We use the CLIP model as a binary classifier, differentiating between two classes:
                     - the prompt used to generate the image
                     - **NOT** the prompt
                     Through our experiments on intentionally-biased T2I models, we found that there was a clear
                     relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
                     how badly model performances have been affected by their biases.
@@ -290,7 +249,7 @@ def setup_additional_information_tab(tab):
             - Adaptor models are not currently supported, we will look to add evaluation functionalities of these
             models in the future.
             - Download, generation, inference and evaluation times are all hardware dependent.
             Keep in mind that these constraints may be removed or added to any time.
             """)
         st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
@@ -299,30 +258,29 @@ def setup_additional_information_tab(tab):
             Given this application is used for the assessment of T2I biases and relies on
             pre-trained models available on HuggingFace, we are not responsible for any content generated
             by public-facing models that have been used to generate images using this application.
             TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
             insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
             representations of marginalised groups, please address your concerns to the model providers.
             However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
             beneficial to the TBYB community to share evaluations of biased T2I models!
             We share no association with HuggingFace \U0001F917, we only use their services as a model repository,
             given their growth in popularity in the computer science community recently.
             For further questions/queries or if you want to simply strike a conversation,
             please reach out to Jordan Vice at: jordan.vice@uwa.edu.au""")
 setup_page_banner()
 setup_how_to()
 if user_login_create():
-    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(
-        ["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
-         "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
     setup_additional_information_tab(tab6)
     # PLASTER THE LOGO EVERYWHERE
@@ -360,19 +318,16 @@ if user_login_create():
                                     user_evaluation_variables.MODEL = modelID
                                     user_evaluation_variables.MODEL_TYPE = modelType
                     else:
-                        st.error(
-                            'The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
-                                                      ' Please check that that HuggingFace repo ID is valid.'
-                                                      ' For more help, please see the "How to Use" Tab above.',
-                            icon="🚨")
         if modelID:
             with st.form("example_image_gen_form", clear_on_submit=True):
                 testPrompt = st.text_input('Input a random test prompt to test out your '
                                            'chosen model and see if its generating images:')
                 submitted2 = st.form_submit_button("Submit")
                 if testPrompt and submitted2:
-                    with st.spinner(
-                            "Generating an image with the prompt:\n" + testPrompt + "(This may take some time)"):
                         testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
                     st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
                     st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!

 import streamlit as st
 st.set_page_config(layout="wide")
 import streamlit_authenticator as stauth
 import pandas as pd
 import numpy as np
 import model_comparison as MCOMP
 import model_loading as MLOAD
 import model_inferencing as MINFER
 import user_evaluation_variables
 import tab_manager
 import yaml
 from yaml.loader import SafeLoader
 from PIL import Image
 AUTHENTICATOR = None
 TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')
 USER_LOGGED_IN = False
+USER_DATABASE_PATH = './data/user_database.yaml'
 def create_new_user(authenticator, users):
     try:
         if authenticator.register_user('Register user', preauthorization=False):
             st.success('User registered successfully')
     except Exception as e:
         st.error(e)
+    with open(USER_DATABASE_PATH, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
 def forgot_password(authenticator, users):
     try:
         username_of_forgotten_password, email_of_forgotten_password, new_random_password = authenticator.forgot_password(
             # Random password should be transferred to user securely
     except Exception as e:
         st.error(e)
+    with open(USER_DATABASE_PATH, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
 def update_account_details(authenticator, users):
     if st.session_state["authentication_status"]:
         try:
                 st.success('Entries updated successfully')
         except Exception as e:
             st.error(e)
+    with open(USER_DATABASE_PATH, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
 def reset_password(authenticator, users):
     if st.session_state["authentication_status"]:
         try:
                 st.success('Password modified successfully')
         except Exception as e:
             st.error(e)
+    with open(USER_DATABASE_PATH, 'w') as file:
         yaml.dump(users, file, default_flow_style=False)
 def user_login_create():
     global AUTHENTICATOR
     global TBYB_LOGO
     global USER_LOGGED_IN
     users = None
+    with open(USER_DATABASE_PATH) as file:
         users = yaml.load(file, Loader=SafeLoader)
         AUTHENTICATOR = stauth.Authenticate(
             users['credentials'],
                 # update_account_details(AUTHENTICATOR, users)
                 reset_password(AUTHENTICATOR, users)
+    return USER_LOGGED_IN
 def setup_page_banner():
     global USER_LOGGED_IN
     # for tab in [tab1, tab2, tab3, tab4, tab5]:
+    c1,c2,c3,c4,c5,c6,c7,c8,c9 = st.columns(9)
     with c5:
         st.image(TBYB_LOGO, use_column_width=True)
+    for col in [c1,c2,c3,c4,c5,c6,c7,c8,c9]:
         col = None
     st.title('Try Before You Bias (TBYB)')
     st.write('*A Quantitative T2I Bias Evaluation Tool*')
 def setup_how_to():
     expander = st.expander("How to Use")
     expander.write("1. Login to your TBYB Account using the bar on the right\n"
                    "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n")
     expander.image(Image.open('./assets/HF_MODEL_ID_EXAMPLE.png'))
+    expander.write("3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n")
     expander.image(Image.open('./assets/lykon_corgi.png'))
     expander.write("4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs "
                    "   to evaluate your model once it has been loaded\n"
                    "   '\U0001F4F0 Additional Information' tab for a TL;DR.\n"
                    "8. For any questions or to report any bugs/issues. Please contact jordan.vice@uwa.edu.au.\n")
 def setup_additional_information_tab(tab):
     with tab:
         st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
         st.markdown(
             """
             *Based on the article of the same name available here --PAPER HYPERLINK--
             Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian
             This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
             implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
             all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
             that others share their evaluations as we look to further the discussion on transparency and reliability
             of T2I models.
             """)
         st.header('2. A (very) Brief Summary')
         st.image(Image.open('./assets/TBYB_flowchart.png'))
         st.markdown(
+                    """
+                    Bias in text-to-image models can propagate unfair social representations and could be exploited to
+                    aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
+                    methods focused on social biases. So, we proposed a bias evaluation methodology that considered
+                    general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
+                    """
+                )
         st.markdown(
+        """
+            We proposed three novel metrics to quantify T2I model biases:
+            1. Distribution Bias - $B_D$
+            2. Jaccard Hallucination - $H_J$
+            3. Generative Miss Rate - $M_G$
+            Open the appropriate drop-down menu to understand the logic and inspiration behind metric.
             """
         )
+        c1,c2,c3 = st.columns(3)
         with c1:
             with st.expander("Distribution Bias - $B_D$"):
                 st.markdown(
                     Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
                     in the context of T2I models, using AuC allows us to define the distribution of objects that have been
                     detected in generated output image scenes.
                     So, everytime an object is detected in a scene, we update a dictionary (which is available for
                     download after running an evaluation). After evaluating a full set of images, you can use this
                     information to determine what objects appear more frequently than others.
                     After all images are evaluated, we sort the objects in descending order and normalize the data. We
                     then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:
                     $B_D = \\Sigma_{i=1}^M\\frac{n_i+n_{i=1}}{2}$
                     So, if a user conducts a task-oriented study on biases related to **dogs** using a model
                     that was heavily biased using pictures of animals in the wild. You might find that after running
                     evaluations, the most common objects detected were trees and grass - even if these objects weren't
                     in relation to some of the most popular large language models. Depending on where you look, hallucinations
                     can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
                     that we echo in our bias evaluations.
                     Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
                     T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
                     specified. This indicates that there could be an innate shift in bias in the model, causing it to
                     add or omit certain objects.
                     Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
                     hallucination. Then, we considered the Jaccard similarity coefficient, which
                     measures the similarity *and* diversity of two sets of objects/samples - defining this as
                     Jaccard Hallucination - $H_J$.
                     Simply put, we define the set of objects detected in the input prompt and then detect the objects in
                     the corresponding output image. Then, we determine the intersect over union. For a model, we
                     calculate the average $H_J$ across generated images using:
                     $H_J = \\frac{\Sigma_{i=0}^{N-1}1-\\frac{\mathcal{X}_i\cap\mathcal{Y}_i}{\mathcal{X}_i\cup\mathcal{Y}_i}}{N}$
                     """
                     of evaluating bias, we thought that it would be important to see if there was a correlation
                     between bias and performance (as we predicted). And while the other metrics do evaluate biases
                     in terms of misalignment, they do not consider the relationship between bias and performance.
                     We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
                     as a model becomes more biased, it will begin to diverge away from the intended target and so, the
                     miss rate of the generative model will increase as a result. This was a major consideration when
                     designing this metric.
                     We use the CLIP model as a binary classifier, differentiating between two classes:
                     - the prompt used to generate the image
                     - **NOT** the prompt
                     Through our experiments on intentionally-biased T2I models, we found that there was a clear
                     relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
                     how badly model performances have been affected by their biases.
             - Adaptor models are not currently supported, we will look to add evaluation functionalities of these
             models in the future.
             - Download, generation, inference and evaluation times are all hardware dependent.
             Keep in mind that these constraints may be removed or added to any time.
             """)
         st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
             Given this application is used for the assessment of T2I biases and relies on
             pre-trained models available on HuggingFace, we are not responsible for any content generated
             by public-facing models that have been used to generate images using this application.
             TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
             insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
             representations of marginalised groups, please address your concerns to the model providers.
             However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
             beneficial to the TBYB community to share evaluations of biased T2I models!
             We share no association with HuggingFace \U0001F917, we only use their services as a model repository,
             given their growth in popularity in the computer science community recently.
             For further questions/queries or if you want to simply strike a conversation,
             please reach out to Jordan Vice at: jordan.vice@uwa.edu.au""")
 setup_page_banner()
 setup_how_to()
 if user_login_create():
+    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
+                                           "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
     setup_additional_information_tab(tab6)
     # PLASTER THE LOGO EVERYWHERE
                                     user_evaluation_variables.MODEL = modelID
                                     user_evaluation_variables.MODEL_TYPE = modelType
                     else:
+                        st.error('The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
+                                                           ' Please check that that HuggingFace repo ID is valid.'
+                                                           ' For more help, please see the "How to Use" Tab above.', icon="🚨")
         if modelID:
             with st.form("example_image_gen_form", clear_on_submit=True):
                 testPrompt = st.text_input('Input a random test prompt to test out your '
                                            'chosen model and see if its generating images:')
                 submitted2 = st.form_submit_button("Submit")
                 if testPrompt and submitted2:
+                    with st.spinner("Generating an image with the prompt:\n"+testPrompt+"(This may take some time)"):
                         testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
                     st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
                     st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!