Commit 06c85d23 authored by Decoupes Remy's avatar Decoupes Remy
Browse files

EDA on empres-i and OIE WAHIS

parent 85a03e44
# ignore elastic data and log
/data/
/log/
# external sources
empres-i-overview-raw-data_202202082233.csv
oie_wahis_HPAI.xlsx
oie_wahis_HPAI_LPAI_France.xlsx
\ No newline at end of file
......@@ -19,6 +19,7 @@ from plotly.subplots import make_subplots
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.manifold import TSNE
import geopandas as gpd
def logsetup():
"""
......@@ -31,7 +32,7 @@ def logsetup():
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
file_handler = RotatingFileHandler(
'/home/rdecoupe/PycharmProjects/mood-tweets-collect/elasticsearch/log/eda/eda.log', 'a', 1000000, 1)
'elasticsearch/log/eda/eda.log', 'a', 1000000, 1)
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
......@@ -199,7 +200,15 @@ def get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords,
# df_results.to_pickle("/home/rdecoupe/Téléchargements/test/get_tweet_content_by_disease.pkl")
return df_results
def visualize_sentence_through_embedding(df_corpus, multi_langue=False):
def visualize_sentence_through_embedding(df_corpus, multi_langue=False, fig_title=""):
"""
Plot a scatter plot of embedding (dim reduced by T-SNE)
:param df_corpus: corpus of tweet
:param multi_langue: True / False
:param fig_title: Title of the fig
:return:
"""
corpus_tweets_list = df_corpus.text.values.tolist()
corpus_tweets_list = list(set(corpus_tweets_list)) #Remove duplicate tweets (mostly RT)
# How to choose the model: An overview on huggingface
......@@ -208,9 +217,10 @@ def visualize_sentence_through_embedding(df_corpus, multi_langue=False):
best_quality_model = "all-mpnet-base-v2"
faster_model = "all-MiniLM-L6-v2" # used by UKPLab : https://github.com/UKPLab/sentence-transformers/blob/a94030226da1f4b03f9d703596b0ebd360c9ef43/examples/applications/clustering/agglomerative.py#L33
if multi_langue:
embedder = SentenceTransformer(multi_lingual_model)
model_used = multi_lingual_model
else:
embedder = SentenceTransformer(faster_model)
model_used = faster_model
embedder = SentenceTransformer(model_used)
# Encode !
corpus_embeddings = embedder.encode(corpus_tweets_list)
# Normalize the embeddings to unit length
......@@ -221,15 +231,19 @@ def visualize_sentence_through_embedding(df_corpus, multi_langue=False):
corpus_embeddings_tsne_df = pd.DataFrame(corpus_embeddings_tsne)
corpus_embeddings_tsne_df["label"] = corpus_tweets_list
corpus_embeddings_tsne_df["country"] = df_corpus.country
corpus_embeddings_tsne_df["timestamp"] = df_corpus["timestamp"]
# plot with plotly express
fig = px.scatter(
corpus_embeddings_tsne_df, x=0, y=1,
hover_data=["label"], color="country"
hover_data=["label", "timestamp"], color="country",
title=fig_title + " | model: "+str(model_used)
)
fig.show()
if __name__ == '__main__':
# Change working directory to the root of the repo, ie, mood_tweets_collect,
# so the relative paths used below (params/, elasticsearch/log/, ...) resolve.
# NOTE(review): assumes the process is launched from two directory levels
# below the repo root (e.g. elasticsearch/<subdir>) — confirm for other launch points
os.chdir('../..')
logger = logsetup()
logger.info("EDA start")
# Figures directory lives next to this script, independent of the chdir above
path_figs_dir = os.path.join(os.path.dirname(__file__), "figs")
......@@ -251,7 +265,7 @@ if __name__ == '__main__':
"""
Count tweets by disease
"""
# Relative path: valid because the script chdir'ed to the repo root at startup.
# (Removed the dead, superseded absolute-path assignment left over from the diff.)
path_param_keywords = "params/keywordsFilter.csv"
params_kw = pd.read_csv(path_param_keywords)
logger.info("Count tweets by disease: browse syndrome")
# Accumulator: starts as "" and is presumably replaced by a DataFrame in the
# (not shown) loop below — TODO confirm against the omitted lines
df_kw_by_disease = ""
......@@ -290,7 +304,7 @@ if __name__ == '__main__':
"""
Time series of keywords (except covid)
"""
# Relative path: valid because the script chdir'ed to the repo root at startup.
# (Removed the dead, superseded absolute-path assignment left over from the diff.)
path_param_keywords = "params/keywordsFilter.csv"
params_kw = pd.read_csv(path_param_keywords)
logger.info("time series by disease")
# Accumulator: starts as "" and is presumably replaced by a DataFrame in the
# (not shown) loop below — TODO confirm against the omitted lines
df_kw_by_disease = ""
......@@ -314,7 +328,8 @@ if __name__ == '__main__':
# One subplot row per disease that has a time series
subplots_timeseries_fig = make_subplots(rows=nb_of_subplots, cols=1)
current_row = 0
for disease in list_disease_with_timeserie:
    # Bar chart of the disease's time series, titled with the disease name.
    # (Removed the duplicated pre-refactor px.bar call left over from the diff;
    # loop-body indentation restored — it was lost in the scrape.)
    timeserie_fig = px.bar(df_kw_by_disease[disease], facet_col_wrap=2,
                           title="Time series for "+str(disease))
    # Track the target subplot row for this disease
    current_row = current_row + 1
    # timeserie_fig.show()
"""
......@@ -328,7 +343,7 @@ if __name__ == '__main__':
"""
Clustering tweets for a disease name
"""
# Relative path: valid because the script chdir'ed to the repo root at startup.
# (Removed the dead, superseded absolute-path assignment left over from the diff.)
path_param_keywords = "params/keywordsFilter.csv"
params_kw = pd.read_csv(path_param_keywords)
logger.info("Work around sentence embedding")
# For Avian Influenza:
......@@ -336,12 +351,55 @@ if __name__ == '__main__':
# Avian-flu keyword set (uses the `disease` value set just above this block)
list_of_keywords = ['Fowl', 'Bird', 'Avian', 'HPAI', 'FowlPlague', 'AvianInfluenza', 'avianInfluenza',
                    'Avianflu', 'bird', 'BirdFlu']
corpus_tweets = get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords, disease)
# (Removed the superseded no-title call left over from the diff — it would
# have rendered each visualization twice.)
visualize_sentence_through_embedding(corpus_tweets, fig_title="Avian flu all kw")
# for Influenza: broader keyword set, including virus subtype names
disease = "Influenza"
list_of_keywords = ['Flu', 'Influenza', 'influenzavirus', 'InfluenzaVirus', 'H1N1', 'H2N2', 'H3N2', 'H3N8', 'H5N1',
                    'H5N2', 'H7N7', 'H9N2', 'H1N2', 'H7N1', 'H7N2', 'H7N3', 'H10N7', 'H7N9', 'H10N8', 'H5N8']
corpus_tweets = get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords, disease)
# Plot twice: default (mono-lingual) model, then the multi-lingual one for comparison
visualize_sentence_through_embedding(corpus_tweets, fig_title="Influenza all kw")
visualize_sentence_through_embedding(corpus_tweets, fig_title="Influenza all kw", multi_langue=True)
# Grippe aviaire: single French-language keyword
list_of_keywords = ['aviaire']
corpus_tweets = get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords, disease)
visualize_sentence_through_embedding(corpus_tweets, fig_title="Aviaire")
visualize_sentence_through_embedding(corpus_tweets, fig_title="Aviaire", multi_langue=True)
logger.info("EDA stop")
"""
External sources EDA
"""
def _empres_i_report_figs(df, area_label):
    """
    Plot Empres-i reports: a bar chart over report dates and a geographic
    scatter of report locations (two figures, both shown interactively).
    :param df: Empres-i dataframe; needs the Observation.date..dd.mm.yyyy.,
        Species, Locality, Longitude and Latitude columns
    :param area_label: suffix for the figure titles, e.g. "World" or "France"
    :return:
    """
    fig = px.bar(df, x=df["Observation.date..dd.mm.yyyy."],
                 hover_data=["Species", "Locality"],
                 title="Empres-I: nb of report " + area_label)
    fig.show()
    geo_df = gpd.GeoDataFrame(df,
                              geometry=gpd.points_from_xy(df.Longitude, df.Latitude))
    fig2 = px.scatter_geo(geo_df, lat=geo_df.geometry.y, lon=geo_df.geometry.x,
                          title="Empres-I: nb of report " + area_label)
    fig2.show()

# Load Empres-i data https://empres-i.apps.fao.org/
empresi_csv_export = "elasticsearch/external_sources/empres-i-overview-raw-data_202202082233.csv"
# header=10: the export ships a preamble before the real header row
df_empres_i = pd.read_csv(empresi_csv_export, header=10, sep=",")
df_empres_i["Observation.date..dd.mm.yyyy."] = pd.to_datetime(df_empres_i["Observation.date..dd.mm.yyyy."])
# World-wide, then France only (code was duplicated inline; factored into the helper)
_empres_i_report_figs(df_empres_i, "World")
_empres_i_report_figs(df_empres_i[df_empres_i["Country"] == "France"], "France")
# Same with OIE WAHIS : https://wahis.oie.int/#/dashboards/qd-dashboard
oie_df = pd.read_excel("elasticsearch/external_sources/oie_wahis_HPAI_LPAI_France.xlsx", engine='openpyxl')
# Build one bar per (semester, administrative division, animal category) triple
cols_concat = ["Semester", "Administrative Division", "Animal Category"]
oie_df["combined"] = oie_df[cols_concat].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
fig_oie_france = px.bar(oie_df, x="combined",
                        y=["New outbreaks", "Susceptible", "Cases", "Killed and disposed of",
                           "Slaughtered", "Deaths", "Vaccinated"],
                        title="OIE WAHIS in France")
fig_oie_france.show()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment