""" @brief: Download metric from Elasticsearch to easily compute TF-IDF matric @author: R.Decoupes @copyright: CeCILL-B Download metric from Elasticsearch to easily compute TF-IDF matric """ from elasticsearch import Elasticsearch from jinja2 import FileSystemLoader, Environment import os import eland as ed import requests import pandas as pd from tqdm import tqdm import json def elastic_pagination_scrolling(result, headers): """ Elasticsearch limit results of query at 10 000. To avoid this limit, we need to paginate results and scroll This method append all pages form scroll search :param result: a result of a ElasticSearcg query :return: """ scroll_size = result['hits']['total']["value"] results = [] # Progress bar pbar = tqdm(total=scroll_size) while (scroll_size > 0): try: scroll_id = result['_scroll_id'] # res = client.scroll(scroll_id=scroll_id, scroll='60s') query = { "scroll": "1m", "scroll_id": scroll_id } query_json = json.dumps(query) res = requests.get(es_url + "_search/scroll", data=query_json, headers=headers, ).json() results += res['hits']['hits'] scroll_size = len(res['hits']['hits']) pbar.update(scroll_size) except: pbar.close() break pbar.close() return results if __name__ == '__main__': """ Why not using eland ? # we could not filter eland df with rest.features.properties.state certainly because there are to much fields ? # Two solutions : # - we are using eland with "ed_tweets["rest_user_osm.extent"] and intersect with polygones # - we use elasticsearch package with Jinja2 template and normalization # We prefere to use elasticsearch """ #ed_tweets = ed.DataFrame("http://mo-mood-tetis-tweets-collect.montpellier.irstea.priv:9200", # es_index_pattern="mood-tetis-tweets-collect") es_url = "http://mo-mood-tetis-tweets-collect.montpellier.irstea.priv:9200/" client_es = Elasticsearch(es_url) index_es = "mood-tetis-tweets-collect" # init jinja2 configuration template_dir = os.path.join(os.path.dirname(__file__), "eda_templates") jinja_env = Environment(loader=FileSystemLoader(template_dir)) template = jinja_env.get_template("filter_by_state_and_date.j2") query = template.render(state="Auvergne") headers = {'content-type': 'application/json'} try: r = requests.get(es_url + index_es + "/_search?scroll=1m&size=1000", # context for scrolling up to 1 minute data=query, headers=headers, ) except Exception as e: print("Can not filter by date and space") results = elastic_pagination_scrolling(r.json(), headers) print(results) print(type(results)) # df_results = pd.json_normalize(r.json()["hits"]["hits"]) #df_results.to_csv("/home/rdecoupe/Téléchargements/yip.csv")