Commit 721b23c8 authored by Decoupes Remy
Browse files

Fix PEP8 warning

parent e6f6d979
......@@ -5,7 +5,6 @@
Exploratory Data Analysis of tweets indexed in Elasticsearch
"""
import json
import logging
from logging.handlers import RotatingFileHandler
from elasticsearch import Elasticsearch
......@@ -14,7 +13,6 @@ from jinja2 import FileSystemLoader, Environment
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
......@@ -29,7 +27,8 @@ def logsetup():
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
file_handler = RotatingFileHandler('/home/rdecoupe/PycharmProjects/mood-tweets-collect/elasticsearch/log/eda/eda.log', 'a', 1000000, 1)
file_handler = RotatingFileHandler(
'/home/rdecoupe/PycharmProjects/mood-tweets-collect/elasticsearch/log/eda/eda.log', 'a', 1000000, 1)
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
......@@ -39,6 +38,7 @@ def logsetup():
logger.addHandler(stream_handler)
return logger
def count_rt(jinja_env, es_url, index_es):
"""
Count the number of RT of the whole corpus
......@@ -54,11 +54,12 @@ def count_rt(jinja_env, es_url, index_es):
try:
r = requests.get(es_url + index_es + "/_search", data=query, headers=headers)
except Exception as e:
logger.error("Count_RT: doesn't work. See the full error: " + print(str(e)))
logger.error("Count_RT: doesn't work")
return -1
nb_RT = r.json()['aggregations']['0-bucket']['doc_count']
return nb_RT
def count_tweets_by_disease_keywords(jinja_env, es_url, index_es, list_of_keywords, disease):
"""
for each keyword: get the nb of tweets containing this keyword
......@@ -76,18 +77,19 @@ def count_tweets_by_disease_keywords(jinja_env, es_url, index_es, list_of_keywor
try:
r = requests.get(es_url + index_es + "/_search", data=query, headers=headers)
except Exception as e:
logger.error("Count_RT: doesn't work. See the full error: " + print(str(e)))
logger.error("Count_RT: doesn't work. See the full error: ")
return -1
df_results = pd.DataFrame.from_dict(r.json()["aggregations"]["0"]["buckets"])
# clean up label from elasticsearch (because it contains "text: [kw]")
df_results.rename(columns=lambda x: x.split(' : ')[1], inplace=True)
# transpose the dataframe
df_results = df_results.T
# add the disase name
# add the disease name
df_results["disease"] = disease
logger.debug(df_results)
return df_results
if __name__ == '__main__':
logger = logsetup()
logger.info("EDA start")
......@@ -123,16 +125,23 @@ if __name__ == '__main__':
df_kw_by_disease = df_kw_by_disease.append(df)
df_kw_by_disease_without_covid = df_kw_by_disease[df_kw_by_disease["disease"] != "SARS-CoV-2 "]
# sunburst with plotly express
pie_fig = px.sunburst(df_kw_by_disease, path=['disease', df_kw_by_disease.index], values='doc_count', color='disease')
pie_fig = px.sunburst(df_kw_by_disease,
path=['disease', df_kw_by_disease.index], values='doc_count', color='disease')
pio.write_image(pie_fig, path_figs_dir + "/count_tweets_by_disease_keywords_sunburst_all.png", format='png')
pie_fig = px.sunburst(df_kw_by_disease_without_covid, path=['disease', df_kw_by_disease_without_covid.index], values='doc_count', color='disease')
pio.write_image(pie_fig, path_figs_dir + "/count_tweets_by_disease_keywords_sunburst_without_covid.png", format='png')
pie_fig = px.sunburst(df_kw_by_disease_without_covid,
path=['disease', df_kw_by_disease_without_covid.index], values='doc_count', color='disease')
pio.write_image(pie_fig, path_figs_dir + "/count_tweets_by_disease_keywords_sunburst_without_covid.png",
format='png')
# treemap
treemap_fig3 = px.treemap(df_kw_by_disease, path=['disease', df_kw_by_disease.index], values='doc_count', color='disease')
pio.write_image(treemap_fig3,path_figs_dir + "/count_tweets_by_disease_keywords_treemap_all.png", format='png')
treemap_without_covid_fig3 = px.treemap(df_kw_by_disease_without_covid, path=['disease', df_kw_by_disease_without_covid.index], values='doc_count',
color='disease')
pio.write_image(treemap_without_covid_fig3, path_figs_dir + "/count_tweets_by_disease_keywords_treemap_without_covid.png", format='png')
treemap_fig3 = px.treemap(df_kw_by_disease,
path=['disease', df_kw_by_disease.index], values='doc_count', color='disease')
pio.write_image(treemap_fig3, path_figs_dir + "/count_tweets_by_disease_keywords_treemap_all.png", format='png')
treemap_without_covid_fig3 = px.treemap(df_kw_by_disease_without_covid,
path=['disease', df_kw_by_disease_without_covid.index], values='doc_count',
color='disease')
pio.write_image(treemap_without_covid_fig3,
path_figs_dir + "/count_tweets_by_disease_keywords_treemap_without_covid")
# Si besoin d'affichage HTML :
treemap_without_covid_fig3.show()
logger.info("EDA stop")
\ No newline at end of file
logger.info("EDA stop")
......@@ -12,3 +12,33 @@ requests-oauthlib==1.3.0
six==1.15.0
tweepy==3.8.0
urllib3==1.25.9
certifi==2020.4.5.2
chardet==3.0.4
cycler==0.11.0
elasticsearch==7.16.3
fonttools==4.28.5
idna==2.9
Jinja2==3.0.3
kaleido==0.2.1
kiwisolver==1.3.2
logger==1.4
MarkupSafe==2.0.1
matplotlib==3.5.1
numpy==1.18.5
oauthlib==3.1.0
packaging==21.3
pandas==1.0.4
Pillow==9.0.0
pip-licenses==2.2.0
plotly==5.5.0
PTable==0.9.2
pyparsing==3.0.6
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2020.1
requests==2.23.0
requests-oauthlib==1.3.0
six==1.15.0
tenacity==8.0.1
tweepy==3.8.0
urllib3==1.25.9
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment