tf-idf-es.py 3.09 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
"""
@brief: Download metric from Elasticsearch to easily compute TF-IDF matric
@author: R.Decoupes
@copyright: CeCILL-B

Download metric from Elasticsearch to easily compute TF-IDF matric
"""
from elasticsearch import Elasticsearch
from jinja2 import FileSystemLoader, Environment
import os
11
12
13
import eland as ed
import requests
import pandas as pd
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from tqdm import tqdm
import json

def elastic_pagination_scrolling(result, headers):
    """
    Collect every hit of a scrolled Elasticsearch search.

    Elasticsearch limits the results of a single query to 10 000. To get past this limit
    the initial search is opened with a scroll context, and this function keeps asking
    for the next page until an empty page comes back.

    The scroll requests are sent to the module-level ``es_url``.

    :param result: decoded JSON body of the initial scrolled search response; its
        ``hits`` and ``_scroll_id`` entries are read
    :param headers: HTTP headers sent with every scroll request
    :return: list of hits: those of the initial response followed by those of every
        scrolled page. If a scroll request fails, the hits gathered so far are returned.
    """
    total_hits = result['hits']['total']["value"]
    # The initial search response already carries the first page of hits:
    # keep it, otherwise that page is lost.
    results = list(result['hits']['hits'])
    scroll_id = result.get('_scroll_id')
    page_size = len(results)
    # The context manager closes the progress bar exactly once, even on error
    with tqdm(total=total_hits) as pbar:
        pbar.update(page_size)
        # An empty page means the scroll is exhausted
        while page_size > 0 and scroll_id is not None:
            query_json = json.dumps({
                "scroll": "1m",
                "scroll_id": scroll_id
            })
            try:
                res = requests.get(es_url + "_search/scroll",
                                   data=query_json,
                                   headers=headers,
                                   ).json()
                page = res['hits']['hits']
            except (requests.RequestException, ValueError, KeyError, TypeError) as err:
                # Best effort: report the failure and return what was fetched so far
                print("Scrolling stopped early:", err)
                break
            # The scroll id may change between requests: always use the latest one
            scroll_id = res.get('_scroll_id', scroll_id)
            results += page
            page_size = len(page)
            pbar.update(page_size)
    return results

50
51

if __name__ == '__main__':
    # Why not use eland?
    # We could not filter the eland DataFrame on rest.features.properties.state,
    # probably because there are too many fields.
    # Two solutions:
    #   - use eland with ed_tweets["rest_user_osm.extent"] and intersect with polygons
    #   - use Elasticsearch queries built from a Jinja2 template, then normalize
    # We prefer to use Elasticsearch.
    #ed_tweets = ed.DataFrame("http://mo-mood-tetis-tweets-collect.montpellier.irstea.priv:9200",
    #                         es_index_pattern="mood-tetis-tweets-collect")

    # es_url is a module-level name: elastic_pagination_scrolling reads it
    es_url = "http://mo-mood-tetis-tweets-collect.montpellier.irstea.priv:9200/"
    client_es = Elasticsearch(es_url)
    index_es = "mood-tetis-tweets-collect"
    # init jinja2 configuration
    template_dir = os.path.join(os.path.dirname(__file__), "eda_templates")
    jinja_env = Environment(loader=FileSystemLoader(template_dir))
    template = jinja_env.get_template("filter_by_state_and_date.j2")
    query = template.render(state="Auvergne")
    headers = {'content-type': 'application/json'}
    try:
        # scroll=1m keeps the scrolling context alive for up to 1 minute
        r = requests.get(es_url + index_es + "/_search?scroll=1m&size=1000",
                         data=query,
                         headers=headers,
                         )
    except requests.RequestException as e:
        print("Can not filter by date and space")
        print(e)
        # Without a response there is nothing to scroll: stop here instead of
        # failing later with a NameError on the unbound response
        raise SystemExit(1)
    results = elastic_pagination_scrolling(r.json(), headers)
    print(results)
    print(type(results))
    # df_results = pd.json_normalize(r.json()["hits"]["hits"])
    #df_results.to_csv("/home/rdecoupe/Téléchargements/yip.csv")