Commit 6874b2bd authored by Decoupes Remy's avatar Decoupes Remy
Browse files

add different crises in query

parent 7a758c5e
......@@ -113,6 +113,12 @@ if __name__ == '__main__':
state = 'aquitaine'
start_date = "2020-12-01T00:00:00.172Z"
end_date = "2022-02-25T15:51:46.172Z"
filters = {
'state' : ["aquitaine", "aquitaine", "occitanie"],
'start_date' : ["2020-12-01T00:00:00.172Z", "2021-04-01T00:00:00.172Z", "2021-01-01T00:00:00.172Z"],
'end_date' : ["2021-12-31T23:00:00.172Z", "2022-02-25T15:51:46.172Z", "2021-07-31T23:00:00.172Z"]
}
df_filters = pd.DataFrame(filters)
# connect to elastic
""" Why not using eland ?
# we could not filter the eland df with rest.features.properties.state, most likely because there are too many fields?
......@@ -131,81 +137,81 @@ if __name__ == '__main__':
template_dir = os.path.join(os.path.dirname(__file__), "eda_templates")
jinja_env = Environment(loader=FileSystemLoader(template_dir))
template = jinja_env.get_template("filter_by_state_and_date.j2")
query = template.render(
state=state,
start_date=start_date,
end_date=end_date
)
df_results = elasticquery(es_url, index_es, query)
df_results["index_of_tweet"] = df_results.index
df_tfidf = tf_idf(df_results["text"].tolist())
df_tfidf["tf_idf_terms"] = df_tfidf.index
df_tfidf = df_tfidf.merge(df_results, on="index_of_tweet")
for i, filter in df_filters.iterrows():
query = template.render(
state=filter["state"],
start_date=filter["start_date"],
end_date=filter["end_date"]
)
df_results = elasticquery(es_url, index_es, query)
df_results["index_of_tweet"] = df_results.index
df_tfidf = tf_idf(df_results["text"].tolist())
df_tfidf["tf_idf_terms"] = df_tfidf.index
df_tfidf = df_tfidf.merge(df_results, on="index_of_tweet")
# Translation to english
model_checkpoint_fr = "Helsinki-NLP/opus-mt-fr-en"
translator_fr = pipeline("translation", model=model_checkpoint_fr)
# zeroshot classification
classifier = pipeline("zero-shot-classification",
model="digitalepidemiologylab/covid-twitter-bert-v2-mnli")
candidate_labels_fr = ["grippe aviaire"]
# candidate_labels_fr = ["covid-19", "grippe aviaire", "AMR", "tiques", "autres"]
candidate_labels_en = ["avian influenza"]
# candidate_labels_en = ["covid-19", "avian influenza", "AMR", "tick borne", "others"]
classifier_results = []
classifier_results_2 = []
for i, tweets in tqdm(df_tfidf.iterrows(), total=df_tfidf.shape[0]):
text = tweets["text"]
# Translation to english
model_checkpoint_fr = "Helsinki-NLP/opus-mt-fr-en"
translator_fr = pipeline("translation", model=model_checkpoint_fr)
# zeroshot classification
classifier = pipeline("zero-shot-classification",
model="digitalepidemiologylab/covid-twitter-bert-v2-mnli")
candidate_labels_fr = ["grippe aviaire"]
# candidate_labels_fr = ["covid-19", "grippe aviaire", "AMR", "tiques", "autres"]
candidate_labels_en = ["avian influenza"]
# candidate_labels_en = ["covid-19", "avian influenza", "AMR", "tick borne", "others"]
classifier_results = []
classifier_results_2 = []
for i, tweets in tqdm(df_tfidf.iterrows(), total=df_tfidf.shape[0]):
text = tweets["text"]
try:
text_translated = text
# text_translated = translator_fr(text)[0]["translation_text"]
classifier_results.append(classifier(text_translated, candidate_labels_fr)["scores"])
item = {"text" : text, "scores" : classifier(text_translated, candidate_labels_fr)["scores"]}
classifier_results_2.append(item)
except:
df_tfidf.drop([i], inplace=True)
print("text: " + text + " | translated: " + text_translated)
classifier_df = pd.DataFrame(classifier_results, columns=candidate_labels_fr)
try:
text_translated = text
# text_translated = translator_fr(text)[0]["translation_text"]
classifier_results.append(classifier(text_translated, candidate_labels_fr)["scores"])
item = {"text" : text, "scores" : classifier(text_translated, candidate_labels_fr)["scores"]}
classifier_results_2.append(item)
f = open("analysis-output/test_2.txt", "w")
for l in classifier_results_2:
f.write(str(l))
f.close()
except:
df_tfidf.drop([i], inplace=True)
print("text: " + text + " | translated: " + text_translated)
classifier_df = pd.DataFrame(classifier_results, columns=candidate_labels_fr)
try:
f = open("analysis-output/test_2.txt", "w")
for l in classifier_results_2:
f.write(str(l))
f.close()
except:
print("can not save file with results from zeroshot")
classifier_df_2 = pd.DataFrame(classifier_results_2)
classifier_df_2.to_csv("analysis-output/acquitaine_test.csv")
df_tfidf = df_tfidf.join(classifier_df)
df_tfidf.to_csv("analysis-output/acquitaine-digitalepidemiologylab.csv")
df_tfidf.to_pickle("analysis-output/acquitaine-digitalepidemiologylab.pkl")
print("can not save file with results from zeroshot")
classifier_df_2 = pd.DataFrame(classifier_results_2)
# classifier_df_2.to_csv("analysis-output/acquitaine_test.csv")
df_tfidf = df_tfidf.join(classifier_df)
df_tfidf.to_csv("analysis-output/" + filter["state"] + "_" + filter["start_date"] + "_digitalepidemiologylab.csv")
# df_tfidf.to_pickle("analysis-output/acquitaine-digitalepidemiologylab.pkl")
# prepare to Gephi for graph vizu: Graph bipartites. Nodes are Newspaper and TF-IDf
news_paper_name = pd.read_csv("./../params/accountsFollowed.csv") # get account (followed by MOOD) names
news_paper_name["retweeted_status.user.id"] = news_paper_name["twitterID"] # prepare for merge
gephi = df_tfidf
# gephi["Source"] = gephi["user.id"].apply(lambda x: hashlib.md5(str(x).encode()).hexdigest()) # pseudonimization
gephi["Source"] = gephi.index # id du lien
gephi["Target"] = gephi["retweeted_status.user.id"]
gephi["Id"] = gephi.index
gephi["Label"] = gephi["tf_idf_terms"]
gephi["timeset"] = gephi["@timestamp"]
gephi = gephi[gephi["Target"].str.len() !=0] # filter out tweet that are not retweeted
gephi[["Id", "Label", "Source", "Target", "timeset"]].to_csv(
"analysis-output/acquitaine_script_gephi_edge.csv",
index=False
)
# gephi.to_csv("analysis-output/gephi-debug.csv")
# Node: newspapers (MOOD account followed)
gephi_node = pd.DataFrame(gephi["retweeted_status.user.id"].unique(), columns=["retweeted_status.user.id"])
gephi_node["Label"] = gephi_node.merge(news_paper_name, on="retweeted_status.user.id")["account"]
gephi_node["Id"] = gephi_node["retweeted_status.user.id"]
# Node: TF-IDF
gephi_node_sub = pd.DataFrame(gephi["tf_idf_terms"].unique(), columns=["tf_idf_terms"])
gephi_node_sub["Id"] = gephi_node_sub.merge(gephi, on="tf_idf_terms")["Id"]
gephi_node_sub = gephi_node_sub.rename(columns={"tf_idf_terms": "Label", "Id": "Id"})
gephi_node = gephi_node.append(gephi_node_sub)
gephi_node.to_csv(
"analysis-output/acquitaine_script_gephi_node.csv",
index=False
)
# prepare to Gephi for graph vizu: Graph bipartites. Nodes are Newspaper and TF-IDf
news_paper_name = pd.read_csv("./../params/accountsFollowed.csv") # get account (followed by MOOD) names
news_paper_name["retweeted_status.user.id"] = news_paper_name["twitterID"] # prepare for merge
gephi = df_tfidf
# gephi["Source"] = gephi["user.id"].apply(lambda x: hashlib.md5(str(x).encode()).hexdigest()) # pseudonimization
gephi["Source"] = gephi.index # id du lien
gephi["Target"] = gephi["retweeted_status.user.id"]
gephi["Id"] = gephi.index
gephi["Label"] = gephi["tf_idf_terms"]
gephi["timeset"] = gephi["@timestamp"]
gephi = gephi[gephi["Target"].str.len() !=0] # filter out tweet that are not retweeted
gephi[["Id", "Label", "Source", "Target", "timeset"]].to_csv(
"analysis-output/acquitaine_script_gephi_edge.csv",
index=False
)
# gephi.to_csv("analysis-output/gephi-debug.csv")
# Node: newspapers (MOOD account followed)
gephi_node = pd.DataFrame(gephi["retweeted_status.user.id"].unique(), columns=["retweeted_status.user.id"])
gephi_node["Label"] = gephi_node.merge(news_paper_name, on="retweeted_status.user.id")["account"]
gephi_node["Id"] = gephi_node["retweeted_status.user.id"]
# Node: TF-IDF
gephi_node_sub = pd.DataFrame(gephi["tf_idf_terms"].unique(), columns=["tf_idf_terms"])
gephi_node_sub["Id"] = gephi_node_sub.merge(gephi, on="tf_idf_terms")["Id"]
gephi_node_sub = gephi_node_sub.rename(columns={"tf_idf_terms": "Label", "Id": "Id"})
gephi_node = gephi_node.append(gephi_node_sub)
gephi_node.to_csv(
"analysis-output/acquitaine_script_gephi_node.csv",
index=False
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment