Commit 85a03e44 authored by Decoupes Remy
Browse files

sentence embedding: compare with a multi-lingual model

parent 65b5a19d
......@@ -199,7 +199,7 @@ def get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords,
# df_results.to_pickle("/home/rdecoupe/Téléchargements/test/get_tweet_content_by_disease.pkl")
return df_results
def visualize_sentence_through_embedding(df_corpus):
def visualize_sentence_through_embedding(df_corpus, multi_langue=False):
corpus_tweets_list = df_corpus.text.values.tolist()
corpus_tweets_list = list(set(corpus_tweets_list)) #Remove duplicate tweets (mostly RT)
# How to choose the model: An overview on huggingface
......@@ -207,7 +207,10 @@ def visualize_sentence_through_embedding(df_corpus):
multi_lingual_model = "distiluse-base-multilingual-cased-v1"
best_quality_model = "all-mpnet-base-v2"
faster_model = "all-MiniLM-L6-v2" # used by UKPLab : https://github.com/UKPLab/sentence-transformers/blob/a94030226da1f4b03f9d703596b0ebd360c9ef43/examples/applications/clustering/agglomerative.py#L33
embedder = SentenceTransformer(faster_model)
if multi_langue:
embedder = SentenceTransformer(multi_lingual_model)
else:
embedder = SentenceTransformer(faster_model)
# Encode !
corpus_embeddings = embedder.encode(corpus_tweets_list)
# Normalize the embeddings to unit length
......@@ -340,4 +343,5 @@ if __name__ == '__main__':
'H5N2', 'H7N7', 'H9N2', 'H1N2', 'H7N1', 'H7N2', 'H7N3', 'H10N7', 'H7N9', 'H10N8', 'H5N8']
corpus_tweets = get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords, disease)
visualize_sentence_through_embedding(corpus_tweets)
visualize_sentence_through_embedding(corpus_tweets, multi_langue=True)
logger.info("EDA stop")
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment