Commit 13183a90 authored by Decoupes Remy
Browse files

Get timeseries of aggregated tweets by pairs of disease and keywords

parent 7dd6a8aa
......@@ -89,6 +89,71 @@ def count_tweets_by_disease_keywords(jinja_env, es_url, index_es, list_of_keywor
logger.debug(df_results)
return df_results
def time_series_by_disease_keywords(jinja_env, es_url, index_es, list_of_keywords, disease):
    """
    Build the time series of tweet counts for every keyword of one disease.

    Elasticsearch answers with one bucket per keyword, labelled "text : <keyword>",
    each bucket holding a date histogram. The answer is reshaped into a dataframe:
        - index : timestamp (date + time, as formatted by Elasticsearch)
        - columns : one per "<disease>:<keyword>" pair, for instance:
                                       <disease>:<keyword 1>  ...  <disease>:<keyword n>
        2021-05-03T00:00:00.000+02:00                   54.0  ...                   3225
        2021-06-02T00:00:00.000+02:00                  127.0  ...                   1979
        2021-07-02T00:00:00.000+02:00                    3.0  ...                   2988
    Keywords without any matching tweet get no column.

    :param jinja_env: Jinja environment used to load "disease_time_series.json.j2"
    :param es_url: URL of Elastic Search
    :param index_es: Index of elastic
    :param list_of_keywords: a list of keywords to run the elastic query
    :param disease: name of the disease, used as prefix of every column name
    :return: the dataframe described above; the string "empty_series" when none of the
        keywords has a tweet; -1 when Elasticsearch could not be queried or when its
        answer could not be read
    """
    template = jinja_env.get_template("disease_time_series.json.j2")
    query = template.render(list_of_keywords=list_of_keywords)
    headers = {'content-type': 'application/json'}
    try:
        r = requests.get(es_url + index_es + "/_search", data=query, headers=headers)
    except Exception:
        # logger.exception appends the traceback, so the "full error" is really logged
        logger.exception("Time series: doesn't work. See the full error: ")
        return -1
    try:
        buckets = r.json()["aggregations"]["0"]["buckets"]
    except (ValueError, KeyError, TypeError):
        # Non-JSON body or Elasticsearch error payload (no "aggregations" entry):
        # report it through the same failure value as an unreachable server
        logger.exception("Time series: unexpected answer from Elasticsearch: ")
        return -1

    # One single-column dataframe per keyword having at least one tweet
    keyword_frames = []
    for label, bucket in buckets.items():
        # clean up label from elasticsearch (because it contains "text : [kw]");
        # maxsplit=1 keeps keywords that themselves contain " : " intact
        keyword = label.split(' : ', 1)[1]
        if bucket['doc_count'] == 0:  # avoid empty time serie
            continue
        rows = [
            [element['key_as_string'], element['doc_count']]
            for element in bucket['time_serie']['buckets']
        ]
        df_kw_timeserie = pd.DataFrame(rows, columns=['timestamp', disease + ':' + keyword])
        keyword_frames.append(df_kw_timeserie.set_index('timestamp'))

    if not keyword_frames:  # none of keywords have a time serie
        return "empty_series"

    # Align all keyword columns on the timestamp index in one pass
    df_all_kw_timeserie = pd.concat(keyword_frames, axis=1)
    logger.debug(df_all_kw_timeserie)
    return df_all_kw_timeserie
if __name__ == '__main__':
logger = logsetup()
......@@ -142,9 +207,33 @@ if __name__ == '__main__':
pio.write_image(treemap_without_covid_fig3,
path_figs_dir + "/count_tweets_by_disease_keywords_treemap_without_covid.png", format='png')
# Si besoin d'affichage HTML :
treemap_without_covid_fig3.show()
# treemap_without_covid_fig3.show()
# Get Account's tweets :
# path_param_accounts = "/home/rdecoupe/PycharmProjects/mood-tweets-collect/params/accountsFollowed.csv"
# params_accounts = pd.read_csv(path_param_accounts)
# logger.info("Get account's tweet")
# Time series of keywords (except covid)
path_param_keywords = "/home/rdecoupe/PycharmProjects/mood-tweets-collect/params/keywordsFilter.csv"
params_kw = pd.read_csv(path_param_keywords)
logger.info("time series by disease")
# Only real dataframes are collected: time_series_by_disease_keywords returns
# "empty_series" or -1 when a disease has nothing to plot or the query failed.
disease_frames = []
for disease in params_kw['syndrome'].unique():
    list_of_keywords = params_kw[params_kw['syndrome'] == disease]["hashtags"].tolist()
    # remove duplicate in keywords, keeping the first-seen order so that
    # the query and the resulting columns are the same from one run to the next
    list_of_keywords = list(dict.fromkeys(list_of_keywords))
    logger.debug("\t" + str(disease) + ": list of keywords: " + str(list_of_keywords))
    if list_of_keywords != []:
        df = time_series_by_disease_keywords(jinja_env, es_url, index_es, list_of_keywords, str(disease))
        # For disease that they have empty time serie for each keywords: we don't plot them
        if isinstance(df, pd.DataFrame):
            disease_frames.append(df)
# df_kw_by_disease_without_covid = df_kw_by_disease[df_kw_by_disease["disease"] != "SARS-CoV-2 "]
if disease_frames:
    # One column per disease:keyword pair, aligned on the timestamp index
    df_kw_by_disease = pd.concat(disease_frames, axis=1)
    logger.debug(df_kw_by_disease.keys())
    timeseries_fig = px.bar(df_kw_by_disease)
    timeseries_fig.show()
else:
    logger.warning("time series by disease: no time serie to plot")
logger.info("EDA stop")
{#
  Elasticsearch search body: one named filter per keyword (label "text : <keyword>",
  matching the keyword in the "text" field), each with a date histogram of the
  matching documents over "@timestamp".
  Expected variable: list_of_keywords.
  Keywords are emitted through "tojson" so that quotes or backslashes inside a
  keyword cannot break the JSON document or alter the query.
#}
{
  "aggs": {
    "0": {
      "filters": {
        "filters": {
          {% for kw in list_of_keywords %}
          {{ ("text : " ~ kw) | tojson }}: {
            "bool": {
              "must": [],
              "filter": [
                {
                  "bool": {
                    "should": [
                      {
                        "match": {
                          "text": {{ kw | string | tojson }}
                        }
                      }
                    ],
                    "minimum_should_match": 1
                  }
                }
              ],
              "should": [],
              "must_not": []
            }
          }{{ ", " if not loop.last else "" }}
          {% endfor %}
        }
      },
      "aggs": {
        "time_serie": {
          "date_histogram": {
            "field": "@timestamp",
            "fixed_interval": "30d",
            "time_zone": "Europe/Paris"
          }
        }
      }
    }
  },
  "size": 0,
  "fields": [
    {
      "field": "@timestamp",
      "format": "date_time"
    }
  ],
  "script_fields": {},
  "stored_fields": [
    "*"
  ],
  "runtime_mappings": {},
  "_source": {
    "excludes": []
  }
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment