Commit fbdbbacc authored by Decoupes Remy

visualize sentence embedding: add location information (country)

parent fcc41b98
@@ -185,16 +185,23 @@ def get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords,
             text = hit["fields"]["extended_tweet.full_text"][0]
         except:  # else we take only the 140 characters
             text = hit["fields"]["text"][0]
+        try:  # test if we have country information
+            country = hit["fields"]["rest.features.properties.country"][0]
+        except:  # else the country is unknown
+            country = "none"
         tweet = {
             "timestamp": hit["fields"]["@timestamp"][0],
-            "text": text
+            "text": text,
+            "country": country
         }
         list_of_tweets.append(tweet)
     df_results = pd.DataFrame(list_of_tweets)
     # df_results.to_pickle("/home/rdecoupe/Téléchargements/test/get_tweet_content_by_disease.pkl")
     return df_results
 
-def visualize_sentence_through_embedding(corpus):
+def visualize_sentence_through_embedding(df_corpus):
+    df_corpus = df_corpus.drop_duplicates(subset="text")  # remove duplicate tweets (mostly RT), keeps country aligned with text
+    corpus_tweets_list = df_corpus.text.values.tolist()
     # How to choose the model: An overview on huggingface
     # https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models
     multi_lingual_model = "distiluse-base-multilingual-cased-v1"
@@ -202,18 +209,19 @@ def visualize_sentence_through_embedding(corpus):
     faster_model = "all-MiniLM-L6-v2"  # used by UKPLab: https://github.com/UKPLab/sentence-transformers/blob/a94030226da1f4b03f9d703596b0ebd360c9ef43/examples/applications/clustering/agglomerative.py#L33
     embedder = SentenceTransformer(faster_model)
     # Encode!
-    corpus_embeddings = embedder.encode(corpus)
+    corpus_embeddings = embedder.encode(corpus_tweets_list)
     # Normalize the embeddings to unit length
     corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
     # Dimension reduction with t-SNE
     tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
     corpus_embeddings_tsne = tsne_model.fit_transform(corpus_embeddings)
     corpus_embeddings_tsne_df = pd.DataFrame(corpus_embeddings_tsne)
-    corpus_embeddings_tsne_df["label"] = corpus
+    corpus_embeddings_tsne_df["label"] = corpus_tweets_list
+    corpus_embeddings_tsne_df["country"] = df_corpus.country.values  # aligned with corpus_tweets_list thanks to drop_duplicates
     # plot with plotly express
     fig = px.scatter(
         corpus_embeddings_tsne_df, x=0, y=1,
-        hover_data=["label"]
+        hover_data=["label"], color="country"
     )
     fig.show()
@@ -325,8 +333,6 @@ if __name__ == '__main__':
     list_of_keywords = ['Fowl', 'Bird', 'Avian', 'HPAI', 'FowlPlague', 'AvianInfluenza', 'avianInfluenza',
                         'Avianflu', 'bird', 'BirdFlu']
     corpus_tweets = get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords, disease)
-    corpus_tweets_list = corpus_tweets.text.values.tolist()
-    corpus_tweets_list = list(set(corpus_tweets_list))  # Remove duplicate tweets (mostly RT)
-    visualize_sentence_through_embedding(corpus_tweets_list)
+    visualize_sentence_through_embedding(corpus_tweets)
     logger.info("EDA stop")
@@ -2,6 +2,7 @@
   "fields": [
     "text",
     "extended_tweet.full_text",
+    "rest.features.properties.country",
     {
       "field": "@timestamp",
       "format": "strict_date_optional_time"
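
Below is a minimal, self-contained sketch (not part of the commit) of how the new country column feeds plotly express: hover_data exposes the tweet text on hover while color groups the points by country. The sample DataFrame and the column names "x"/"y" are hypothetical stand-ins for the two t-SNE components that the script stores in columns 0 and 1.

import pandas as pd
import plotly.express as px

# Hypothetical 2D coordinates standing in for the t-SNE output of the tweet embeddings.
df = pd.DataFrame({
    "x": [0.1, -1.2, 0.8, 2.3],
    "y": [1.0, 0.4, -0.9, 0.2],
    "label": ["tweet A", "tweet B", "tweet C", "tweet D"],  # hover text
    "country": ["France", "none", "Italy", "France"],       # drives the point colour
})

# Same call pattern as in the commit: one colour per country, tweet text on hover.
fig = px.scatter(df, x="x", y="y", hover_data=["label"], color="country")
fig.show()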