Commit 878a75fc authored by Decoupes Remy
Browse files

Enrich TF-IDF terms with their tweet context

parent 1c714535
......@@ -97,7 +97,6 @@ def tf_idf(list_of_docs, lang='english', nb_top_score=1000):
columns=listOfTerms
)
# sort value by TF-IDF score
# df.to_pickle('/home/rdecoupe/Téléchargements/acquitaine.pkl')
tf_idf_top_score = pd.DataFrame()
tf_idf_top_score["index_of_tweet"] = df.idxmax() # keep a track from tweet ID
tf_idf_top_score["tfidf_score"] = df.max().values # retrieve TF-IDF score for the term in document
......@@ -133,5 +132,8 @@ if __name__ == '__main__':
end_date=end_date
)
df_results = elasticquery(es_url, index_es, query)
df_results["index_of_tweet"] = df_results.index
df_tfidf = tf_idf(df_results["text"].tolist())
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine2.csv")
df_tfidf["tf_idf_terms"] = df_tfidf.index
df_tfidf = df_tfidf.merge(df_results, on="index_of_tweet")
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine3.csv")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment