Commit 54665db9 authored by Decoupes Remy
Browse files

compute tf-idf and extract the top ranked terms

parent 05200669
......@@ -13,7 +13,7 @@ import requests
import pandas as pd
from tqdm import tqdm
import json
import math
from sklearn.feature_extraction.text import TfidfVectorizer
def elastic_pagination_scrolling(result, headers):
"""
......@@ -42,6 +42,7 @@ def elastic_pagination_scrolling(result, headers):
results += res['hits']['hits']
scroll_size = len(res['hits']['hits'])
pbar.update(scroll_size)
print(scroll_size)
except:
pbar.close()
break
......@@ -63,7 +64,7 @@ def elasticquery(es_url, index_es, query):
headers=headers,
)
except Exception as e:
print("Can not query: "+str(querys))
print("Can not query: "+str(query))
results = elastic_pagination_scrolling(r.json(), headers)
df_results = pd.DataFrame(results)
""" Formating elasticsearch respons
......@@ -77,6 +78,25 @@ def elasticquery(es_url, index_es, query):
df_results = df_results.applymap(lambda x: x[0] if isinstance(x, list) else '')
return df_results
def tf_idf(list_of_docs, lang='english', nb_top_score=1000):
    """
    Compute TF-IDF over a corpus and return its top ranked terms.

    :param list_of_docs: iterable of raw text documents (one string per document)
    :param lang: value forwarded to TfidfVectorizer's ``stop_words`` argument
    :param nb_top_score: number of highest scoring terms to keep
    :return: pandas Series indexed by term, holding for each term its maximum
        TF-IDF score over all documents, sorted in descending order.
        An empty Series is returned when the vectorization fails.
    """
    try:
        vectorizer = TfidfVectorizer(
            stop_words=lang,
            max_features=250000,
            # NOTE(review): the range "A-z" also matches [ \ ] ^ _ and the
            # backtick; left untouched because narrowing it would change
            # the extracted tokens -- confirm whether this is intended.
            token_pattern="[A-zÀ-ÿ0-9#@]+"
        )
        vectors = vectorizer.fit_transform(list_of_docs)
    except Exception:
        # Previously a bare ``except`` fell through to the code below and
        # crashed with UnboundLocalError; stop here with an empty result.
        print("tf-idf failled")
        return pd.Series(dtype="float64")
    terms = vectorizer.get_feature_names_out()
    # Column-wise maximum taken directly on the sparse matrix: avoids
    # materialising the dense (documents x terms) matrix as Python lists.
    best_scores = vectors.max(axis=0).toarray().ravel()
    # sort value by TF-IDF score
    tf_idf_top_score = pd.Series(best_scores, index=terms).nlargest(nb_top_score)
    return tf_idf_top_score
if __name__ == '__main__':
""" Why not using eland ?
......@@ -96,6 +116,7 @@ if __name__ == '__main__':
template_dir = os.path.join(os.path.dirname(__file__), "eda_templates")
jinja_env = Environment(loader=FileSystemLoader(template_dir))
template = jinja_env.get_template("filter_by_state_and_date.j2")
query = template.render(state="Auvergne")
query = template.render(state="aquitaine")
df_results = elasticquery(es_url, index_es, query)
df_results.to_csv("/home/rdecoupe/Téléchargements/yip2.csv")
df_tfidf = tf_idf(df_results["text"].tolist())
df_tfidf.to_csv("/home/rdecoupe/Téléchargements/yip3.csv")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment