Commit 6598c331 authored by Decoupes Remy
Browse files

Try a French model for zero-shot classification

parent 851fcdbd
......@@ -14,6 +14,7 @@ import pandas as pd
from tqdm import tqdm
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
def elastic_pagination_scrolling(result, headers):
"""
......@@ -136,19 +137,7 @@ if __name__ == '__main__':
df_tfidf = tf_idf(df_results["text"].tolist())
df_tfidf["tf_idf_terms"] = df_tfidf.index
df_tfidf = df_tfidf.merge(df_results, on="index_of_tweet")
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine3.csv")
# prepare to Gephi for graph vizu
# gephi = df_results
# gephi["Source"] = gephi["id"]
# gephi["Target"] = gephi["retweeted_status.id"]
# gephi["ID"] = gephi.index
# gephi["Timestamp"] = gephi["@timestamp"]
# gephi["label"] = gephi["retweeted_status.user.name"]
# gephi = gephi[gephi["Target"].str.len() !=0] # filter out tweet that are not retweeted
# gephi[["ID", "label", "Source", "Target", "Timestamp"]].to_csv(
# "/home/rdecoupe/Téléchargements/acquitaine_script_gephi.csv",
# index = False
# )
gephi = df_results
gephi["Source"] = gephi["user.name"]
gephi["Target"] = gephi["retweeted_status.user.name"]
......@@ -160,3 +149,17 @@ if __name__ == '__main__':
"/home/rdecoupe/Téléchargements/acquitaine_script_gephi.csv",
index=False
)
# Zero-shot classification: score every tweet against a fixed set of
# candidate topics using the CamemBERT/XNLI model from the Hugging Face hub.
classifier = pipeline("zero-shot-classification",
                      model="BaptisteDoyen/camembert-base-xnli")
candidate_labels = ["covid-19", "grippe aviaire", "AMR", "tiques", "autres"]
# One pipeline call per tweet; only the text column is needed, so iterate it
# directly instead of going through iterrows().
classifier_results = [
    classifier(text, candidate_labels) for text in df_tfidf["text"]
]
print(classifier_results)
# Each result is a dict with the input "sequence" and parallel
# "labels"/"scores" lists sorted by decreasing score, so the label order
# changes from tweet to tweet. Re-key every result by label to obtain one
# score column per candidate label. (The previous `df_tfidf[[candidate_labels]]`
# used a nested list as column key and the raw results carry no per-label
# columns, so that assignment could not work.)
label_scores = pd.DataFrame(
    [dict(zip(res["labels"], res["scores"])) for res in classifier_results],
    index=df_tfidf.index,  # keep rows aligned with the tweets they describe
    columns=candidate_labels,
)
df_tfidf[candidate_labels] = label_scores
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine4.csv")
absl-py==1.0.0
astunparse==1.6.3
attrs==21.4.0
cached-property==1.5.2
cachetools==5.0.0
certifi==2020.4.5.2
chardet==3.0.4
idna==2.9
numpy==1.18.5
oauthlib==3.1.0
pandas==1.0.4
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2020.1
requests==2.23.0
requests-oauthlib==1.3.0
six==1.15.0
tweepy==3.8.0
urllib3==1.25.9
certifi==2020.4.5.2
chardet==3.0.4
click==8.0.3
click-plugins==1.1.1
cligj==0.7.2
cycler==0.11.0
elasticsearch==7.16.3
eland==8.0.0
elastic-transport==8.0.1
elasticsearch==8.1.1
et-xmlfile==1.1.0
filelock==3.4.2
Fiona==1.8.21
flatbuffers==2.0
fonttools==4.28.5
gast==0.5.3
geopandas==0.10.2
google-auth==2.6.2
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
grpcio==1.44.0
h5py==3.6.0
huggingface-hub==0.4.0
idna==2.9
importlib-metadata==4.10.1
Jinja2==3.0.3
joblib==1.1.0
kaleido==0.2.1
keras==2.8.0
Keras-Preprocessing==1.1.2
kiwisolver==1.3.2
libclang==13.0.0
logger==1.4
Markdown==3.3.6
MarkupSafe==2.0.1
matplotlib==3.5.1
numpy==1.18.5
munch==2.5.0
nltk==3.6.7
numpy==1.21.5
oauthlib==3.1.0
openpyxl==3.0.9
opt-einsum==3.3.0
packaging==21.3
pandas==1.0.4
pandas==1.3.5
Pillow==9.0.0
pip-licenses==2.2.0
plotly==5.5.0
protobuf==3.19.4
PTable==0.9.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyparsing==3.0.6
pyproj==3.2.1
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2020.1
PyYAML==6.0
regex==2022.1.18
requests==2.23.0
requests-oauthlib==1.3.0
rsa==4.8
sacremoses==0.0.47
scikit-learn==1.0.2
scipy==1.7.3
sentence-transformers==2.1.0
sentencepiece==0.1.96
Shapely==1.8.0
six==1.15.0
tenacity==8.0.1
tensorboard==2.8.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorflow==2.8.0
tensorflow-io-gcs-filesystem==0.24.0
termcolor==1.1.0
tf-estimator-nightly==2.8.0.dev2021122109
threadpoolctl==3.1.0
tokenizers==0.11.4
torch==1.10.2
torchvision==0.11.3
tqdm==4.62.3
transformers==4.17.0
tweepy==3.8.0
urllib3==1.25.9
typing_extensions==4.0.1
urllib3==1.25.11
Werkzeug==2.1.0
wrapt==1.14.0
xlrd==2.0.1
zipp==3.7.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment