Commit ab95d7ca authored by Decoupes Remy
Browse files

get_tweet_content: retrieves result

parent cca3ebf5
......@@ -16,6 +16,7 @@ import pandas as pd
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
from collections import defaultdict
def logsetup():
"""
......@@ -157,6 +158,22 @@ def time_series_by_disease_keywords(jinja_env, es_url, index_es, list_of_keyword
logger.debug(df_all_kw_timeserie)
return df_all_kw_timeserie
def get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords, disease, nb_of_estimated_results=10000):
    """
    Retrieve from Elasticsearch the content of the tweets matching a list of keywords.

    The query body is rendered from the "get_tweets_content_by_keywords.json.j2"
    template and sent to the `_search` endpoint of the given index.

    :param jinja_env: Jinja2 environment used to load the query template
    :param es_url: Elasticsearch base URL; it is concatenated directly with index_es,
        so it must already end with the separator the caller wants
    :param index_es: name of the index to search (appended to es_url)
    :param list_of_keywords: keywords passed to the template as `list_of_keywords`
    :param disease: name of the disease the keywords refer to (not used by the
        query itself; kept for the callers' signature)
    :param nb_of_estimated_results: value of the `size` URL parameter, i.e. the
        maximum number of hits asked for
    :return: a DataFrame with one row per hit, built from each hit's "fields"
        entry, or -1 when the HTTP request fails
    """
    template = jinja_env.get_template("get_tweets_content_by_keywords.json.j2")
    query = template.render(list_of_keywords=list_of_keywords)
    headers = {'content-type': 'application/json'}
    try:
        search_url = es_url + index_es + "/_search?size=" + str(nb_of_estimated_results)
        r = requests.get(search_url, data=query, headers=headers)
        # An error answer (4xx/5xx) has no "hits" key: send it down the same
        # error path as a connection failure instead of failing later on a KeyError.
        r.raise_for_status()
    except Exception:
        # logger.exception logs at ERROR level and appends the traceback, so the
        # "full error" announced by the message is really written to the log.
        logger.exception("Get tweet content: doesn't work. See the full error: ")
        return -1
    list_of_tweets = [hit["fields"] for hit in r.json()["hits"]["hits"]]
    return pd.DataFrame(list_of_tweets)
if __name__ == '__main__':
logger = logsetup()
logger.info("EDA start")
......@@ -170,11 +187,15 @@ if __name__ == '__main__':
jinja_env = Environment(loader=FileSystemLoader(template_dir))
# end of init
# Count RT and no RT
"""
Count RT and no RT
"""
# nb_rt_query = count_rt(jinja_env, es_url, index_es)
# print(nb_rt_query)
# Count tweets by disease
"""
Count tweets by disease
"""
path_param_keywords = "/home/rdecoupe/PycharmProjects/mood-tweets-collect/params/keywordsFilter.csv"
params_kw = pd.read_csv(path_param_keywords)
logger.info("Count tweets by disease: browse syndrome")
......@@ -211,12 +232,9 @@ if __name__ == '__main__':
# Si besoin d'affichage HTML :
# treemap_without_covid_fig3.show()
# Get Account's tweets :
# path_param_accounts = "/home/rdecoupe/PycharmProjects/mood-tweets-collect/params/accountsFollowed.csv"
# params_accounts = pd.read_csv(path_param_accounts)
# logger.info("Get account's tweet")
# Time series of keywords (except covid)
"""
Time series of keywords (except covid)
"""
path_param_keywords = "/home/rdecoupe/PycharmProjects/mood-tweets-collect/params/keywordsFilter.csv"
params_kw = pd.read_csv(path_param_keywords)
logger.info("time series by disease")
......@@ -250,6 +268,18 @@ if __name__ == '__main__':
"""
for trace in range(len(timeserie_fig["data"])):
subplots_timeseries_fig.add_trace(timeserie_fig["data"][trace], row=current_row, col=1)
subplots_timeseries_fig.show()
# subplots_timeseries_fig.show()
"""
Clustering tweets for a disease name
"""
# Keywords parameter file (absolute path on the author's workstation).
path_param_keywords = "/home/rdecoupe/PycharmProjects/mood-tweets-collect/params/keywordsFilter.csv"
# NOTE(review): params_kw is not read by the statements of this section, the
# keywords below are hard-coded - confirm whether this load is still needed.
params_kw = pd.read_csv(path_param_keywords)
# The previous message ("time series by disease") was copied from the section
# above and made the log report the wrong step.
logger.info("Clustering tweets for a disease name")
# For Avian Influenza:
disease = "Avian influenza"
list_of_keywords = ['Fowl', 'Bird', 'Avian', 'HPAI', 'FowlPlague', 'AvianInfluenza', 'avianInfluenza',
                    'Avianflu', 'bird', 'BirdFlu']
# Returns a DataFrame of tweet fields, or -1 if the Elasticsearch request failed.
corpus_tweets = get_tweet_content_by_disease(jinja_env, es_url, index_es, list_of_keywords, disease)
logger.info("EDA stop")
{#
  Elasticsearch search body: content of the tweets matching at least one keyword.

  Input variable:
    list_of_keywords - iterable of keywords; one "match" clause on the `text`
                       field is generated per keyword, and a tweet is kept as
                       soon as one clause matches (minimum_should_match = 1).

  Returned per hit (through "fields", `_source` is disabled):
    text, extended_tweet.full_text and @timestamp.

  Only tweets whose @timestamp lies inside the fixed range below are returned.

  Keywords are emitted with the `tojson` filter so that a keyword containing a
  double quote or a backslash still produces a valid JSON string.
#}
{
  "fields": [
    "text",
    "extended_tweet.full_text",
    {
      "field": "@timestamp",
      "format": "strict_date_optional_time"
    }
  ],
  "_source": false,
  "query": {
    "bool": {
      "must": [],
      "filter": [
        {
          "bool": {
            "should": [
              {% for kw in list_of_keywords %}
              {
                "bool": {
                  "should": [
                    {
                      "match": {
                        "text": {{ kw | tojson }}
                      }
                    }
                  ],
                  "minimum_should_match": 1
                }
              }{{ ", " if not loop.last else "" }}
              {% endfor %}
            ],
            "minimum_should_match": 1
          }
        },
        {
          "range": {
            "@timestamp": {
              "format": "strict_date_optional_time",
              "gte": "2019-02-01T14:48:58.660Z",
              "lte": "2022-02-01T14:48:58.660Z"
            }
          }
        }
      ],
      "should": [],
      "must_not": []
    }
  }
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment