Commit 1c714535 authored by Decoupes Remy
Browse files

Also filter by time period and improve output

parent 2eae195b
......@@ -31,8 +31,8 @@
"range": {
"@timestamp": {
"format": "strict_date_optional_time",
"gte": "2019-03-25T15:51:46.172Z",
"lte": "2022-03-25T15:51:46.172Z"
"gte": "{{ start_date }}",
"lte": "{{ end_date }}"
}
}
}
......
......@@ -97,10 +97,19 @@ def tf_idf(list_of_docs, lang='english', nb_top_score=1000):
columns=listOfTerms
)
# sort value by TF-IDF score
tf_idf_top_score = df.max().nlargest(nb_top_score)
# df.to_pickle('/home/rdecoupe/Téléchargements/acquitaine.pkl')
tf_idf_top_score = pd.DataFrame()
tf_idf_top_score["index_of_tweet"] = df.idxmax() # keep a track from tweet ID
tf_idf_top_score["tfidf_score"] = df.max().values # retrieve TF-IDF score for the term in document
tf_idf_top_score = tf_idf_top_score.sort_values(by="tfidf_score", ascending=False) # sort values
return tf_idf_top_score
if __name__ == '__main__':
# Filter
state = 'aquitaine'
start_date = "2020-12-01T00:00:00.172Z"
end_date = "2022-02-25T15:51:46.172Z"
# connect to elastic
""" Why not using eland ?
# we could not filter the eland df with rest.features.properties.state, probably because there are too many fields?
# Two solutions :
......@@ -118,7 +127,11 @@ if __name__ == '__main__':
template_dir = os.path.join(os.path.dirname(__file__), "eda_templates")
jinja_env = Environment(loader=FileSystemLoader(template_dir))
template = jinja_env.get_template("filter_by_state_and_date.j2")
query = template.render(state="aquitaine")
query = template.render(
state=state,
start_date=start_date,
end_date=end_date
)
df_results = elasticquery(es_url, index_es, query)
df_tfidf = tf_idf(df_results["text"].tolist())
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/yip3.csv")
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/acquitaine2.csv")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment