Commit 410a9dbe authored by Rémy Decoupes
Browse files

zero shot classif w/ translation into english

parent 6598c331
......@@ -15,6 +15,7 @@ __pycache__*
.idea/
#### elastic data & logs ####
elasticsearch/data/
elasticsearch/analysis-output/
elasticsearch/log/
elasticsearch/logstash/sincedb.log
elasticsearch/logfix_bad_quote_json.log
......
......@@ -146,19 +146,26 @@ if __name__ == '__main__':
gephi["label"] = gephi["retweeted_status.user.name"]
gephi = gephi[gephi["Target"].str.len() !=0] # filter out tweet that are not retweeted
gephi[["ID", "label", "Source", "Target", "Timestamp"]].to_csv(
"/home/rdecoupe/Téléchargements/acquitaine_script_gephi.csv",
"analysis-output/acquitaine_script_gephi.csv",
index=False
)
# Translation to english
model_checkpoint_fr = "Helsinki-NLP/opus-mt-fr-en"
translator_fr = pipeline("translation", model=model_checkpoint_fr)
# zeroshot classification
classifier = pipeline("zero-shot-classification",
model="BaptisteDoyen/camembert-base-xnli")
candidate_labels = ["covid-19", "grippe aviaire", "AMR", "tiques", "autres"]
model="digitalepidemiologylab/covid-twitter-bert-v2-mnli")
candidate_labels_fr = ["covid-19", "grippe aviaire", "AMR", "tiques", "autres"]
candidate_labels_en = ["covid-19", "avian influenza", "AMR", "tick borne", "others"]
classifier_results = []
for i, tweets in df_tfidf.iterrows():
classifier_results.append(classifier(tweets["text"], candidate_labels))
print(classifier_results)
df_tfidf[[candidate_labels]] = pd.DataFrame(classifier_results)
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine4.csv")
for i, tweets in tqdm(df_tfidf.iterrows()):
text = tweets["text"]
text_translated = translator_fr(text)[0]["translation_text"]
classifier_results.append(classifier(text_translated, candidate_labels_en)["scores"])
classifier_df = pd.DataFrame(classifier_results, columns=candidate_labels_en)
df_tfidf = df_tfidf.join(classifier_df)
df_tfidf.to_csv("analysis-output/acquitaine-digitalepidemiologylab.csv")
df_tfidf.to_pickle("analysis-output/acquitaine-digitalepidemiologylab.pkl")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment