Commit 0fd684c8 authored by Decoupes Remy
Browse files

gephi visualization: on newspaper twitter account name

parent d4dacf45
......@@ -19,3 +19,4 @@ elasticsearch/log/
elasticsearch/logstash/sincedb.log
elasticsearch/logfix_bad_quote_json.log
elasticsearch/src/figs/
/elasticsearch/analysis-output/gephi_on_acquitaine.gephi
......@@ -5,6 +5,8 @@
Download metrics from Elasticsearch to easily compute a TF-IDF matrix
"""
import hashlib
from elasticsearch import Elasticsearch
from jinja2 import FileSystemLoader, Environment
import os
......@@ -139,26 +141,31 @@ if __name__ == '__main__':
df_tfidf = df_tfidf.merge(df_results, on="index_of_tweet")
# prepare the data for graph visualization in Gephi
gephi = df_results
gephi["Source"] = gephi["user.name"]
gephi["Source"] = gephi["user.name"].apply(lambda x: hashlib.md5(x.encode()).hexdigest()) # pseudonimization
gephi["Target"] = gephi["retweeted_status.user.name"]
gephi["ID"] = gephi.index
gephi["Timestamp"] = gephi["@timestamp"]
gephi["label"] = gephi["retweeted_status.user.name"]
gephi["timeset"] = gephi["@timestamp"]
gephi = gephi[gephi["Target"].str.len() !=0] # filter out tweet that are not retweeted
gephi[["ID", "label", "Source", "Target", "Timestamp"]].to_csv(
"/home/rdecoupe/Téléchargements/acquitaine_script_gephi.csv",
gephi[["ID", "Source", "Target", "Timestamp"]].to_csv(
"/home/rdecoupe/Téléchargements/acquitaine_script_gephi_edge.csv",
index=False
)
gephi_node = pd.DataFrame(gephi["Target"].unique(), columns=["Label"])
gephi_node["ID"] = gephi_node["Label"]
gephi_node.to_csv(
"/home/rdecoupe/Téléchargements/acquitaine_script_gephi_node.csv",
index=False
)
# zeroshot classification
classifier = pipeline("zero-shot-classification",
model="BaptisteDoyen/camembert-base-xnli")
candidate_labels = ["covid-19", "grippe aviaire", "AMR", "tiques", "autres"]
classifier_results = []
for i, tweets in tqdm(df_tfidf.iterrows()):
classifier_results.append(classifier(tweets["text"], candidate_labels)["scores"])
print(classifier_results)
df_tfidf[[candidate_labels]] = pd.DataFrame(classifier_results)
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine4.csv")
# # zeroshot classification
# classifier = pipeline("zero-shot-classification",
# model="BaptisteDoyen/camembert-base-xnli")
# candidate_labels = ["covid-19", "grippe aviaire", "AMR", "tiques", "autres"]
# classifier_results = []
# for i, tweets in tqdm(df_tfidf.iterrows()):
# classifier_results.append(classifier(tweets["text"], candidate_labels)["scores"])
# print(classifier_results)
# df_tfidf[[candidate_labels]] = pd.DataFrame(classifier_results)
# df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine4.csv")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment