Commit c59ec3c5 authored by Fize Jacques's avatar Fize Jacques
Browse files

Initial Commit

parent 685555ab
include gazpy/resources/*
\ No newline at end of file
# coding = utf-8
from .gazetteer.geonames import *
from .gazetteer.geodict import *
from .helpers import *
\ No newline at end of file
# coding = utf-8
from ..query.query_builder import QueryBuilder
import pandas as pd
import re, os ,inspect
from ..objectify import objectify as ob
# Directory that holds the bundled geographic-term lists; it is resolved
# relative to the location of the ``objectify`` object imported above.
__geo_res_path=os.path.join(os.path.dirname(inspect.getfile(ob)),"resources/")


def _load_geo_terms(filename, lowercase=False):
    """
    Read one geographic-term resource file and return its lines.

    Parameters
    ----------
    filename : str
        name of the file inside the resources directory
    lowercase : bool
        lower-case the whole content before splitting it

    Returns
    -------
    list of str
        one entry per line of the file
    """
    # Explicit UTF-8: the term lists contain accented entries, and the
    # platform default encoding is not guaranteed to decode them.
    # The context manager closes the handle as soon as the content is read.
    with open(os.path.join(__geo_res_path, filename), encoding="utf-8") as handle:
        content = handle.read()
    if lowercase:
        content = content.lower()
    return content.strip().split("\n")


# Geographic vocabulary per language code, used to tell generic terms from proper names.
geo_term={
    "fr":_load_geo_terms("geo_term_fr", lowercase=True),
    "en":_load_geo_terms("geo_term_en"),
}
def parse_label2(label : str,lang):
    """
    Build a word-by-word regular expression from a label/toponym.

    The label is cleaned the same way as in ``parse_label`` and split on
    spaces. Each word is converted with ``parse_label``; a word found in
    ``geo_term[lang]`` keeps its optional form ``(...)?`` while any other
    word becomes ``(...)+``. The word patterns are joined with an optional
    space and wrapped in ``/``.

    Parameters
    ----------
    label : str
        toponym
    lang : str
        language code; when it has no entry in ``geo_term`` the result of
        ``parse_label(label)`` is returned unchanged

    Returns
    -------
    str
        regular expression built from the toponym
    """
    if lang not in geo_term:
        return parse_label(label)

    without_marks = re.sub('[(?)]+', "", label.strip())
    cleaned = re.sub("[ ]+", " ", without_marks).strip("'").strip("’")

    known_terms = geo_term[lang]
    fragments = [
        parse_label(word).strip("/")
        if word.lower() in known_terms
        else parse_label(word).strip("/?") + "+"
        for word in cleaned.split(" ")
    ]
    return "/" + "[ ]?".join(fragments) + "/"
def parse_label(label: str):
    """
    Turn a label/toponym into a tolerant regular expression.

    Parentheses and question marks are removed, runs of spaces collapsed and
    surrounding apostrophes stripped. The remaining characters are then
    grouped: an upper-case letter opens a group accepting both cases, a space
    or an apostrophe closes the current group as optional, and every other
    character is copied as-is.

    Parameters
    ----------
    label : str
        toponym

    Returns
    -------
    str
        regular expression built from the toponym, wrapped in ``/``
    """
    cleaned = re.sub('[(?)]+', "", label.strip())
    cleaned = re.sub("[ ]+", " ", cleaned)
    cleaned = cleaned.strip("'").strip("’")

    pattern = ""
    for char in cleaned:
        if char.isupper():
            # Close the running group unless one was just closed (or nothing is open yet).
            if pattern and not pattern.endswith((")", "?")):
                pattern += ")"
            pattern += "([" + char.lower() + char + "]"
        elif char == " ":
            pattern += ")?[ ]?"
        elif char in ("'", "’"):
            pattern += char + ")?"
        else:
            # A plain character opens a group at the very start or right after an optional group.
            if not pattern or pattern.endswith("?"):
                pattern += "("
            pattern += char
    return "/" + pattern + ")?/"
class Base():
    """
    Base class for gazetteer getters.

    It stores the Elasticsearch client, a ``QueryBuilder`` and the name of the
    field used as score. The ``get_*`` methods are placeholders that raise
    ``NotImplementedError`` until a subclass overrides them.
    """

    def __init__(self,es_client,field_score="score"):
        """
        Constructor for Base

        Parameters
        ----------
        es_client : object
            client whose ``search`` results are fed to ``convert_es_to_pandas``
        field_score : str
            name of the field holding the score of an entry
        """
        self.score_field=field_score
        self.qb=QueryBuilder()
        self.es_client=es_client

    def get_by_label(self,label,lang,score=True,size=1):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def get_by_alias(self,alias,lang,score=True,size=1):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def get_n_label_similar(self,label,lang,n,score=True):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def get_n_alias_similar(self,alias,lang,n,score=True):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def get_in_radius(self,lon,lat,score=True,size=1):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def get_by_id(self,id):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def convert_es_to_pandas(self,es_query_results):
        """
        Return a `pandas.Dataframe` object built from the elasticsearch query results

        Parameters
        ----------
        es_query_results : dict
            elasticsearch.search() result

        Returns
        -------
        pandas.DataFrame or None
            Dataframe of the elasticsearch query results, one row per hit,
            with the score column converted to float and missing scores set
            to -1. ``None`` when the result contains no hit.
        """
        # Decide on the hit list itself rather than on ["hits"]["total"], so
        # the check does not depend on the shape of "total" (plain count or
        # object; presumably this varies with the server version).
        rows = [hit["_source"] for hit in es_query_results["hits"]["hits"]]
        if not rows:
            return None

        df = pd.DataFrame(rows)
        if self.score_field in df:
            # Assign the converted column back instead of filling "in place"
            # on a column selection, which is not guaranteed to reach the frame.
            df[self.score_field] = df[self.score_field].apply(float).fillna(-1)
        else:
            # No hit carries a score: use the same -1 sentinel as for a missing value.
            df[self.score_field] = -1.0
        return df
\ No newline at end of file
# coding = utf-8
from .base import Base,parse_label,parse_label2
class Geodict(Base):
    """
    Getter for the Geodict gazetteer.

    Every query is sent through ``es_client.search`` with ``"gazetteer"`` and
    ``"place"`` as the first two positional arguments (presumably the index
    and the document type), and the result is converted with
    ``convert_es_to_pandas``.
    """

    _INDEX = "gazetteer"
    _DOC_TYPE = "place"
    # Supported external identifier kinds and the document field storing each of them.
    _OTHER_ID_FIELDS = {"wikidata": "wikidataID", "geonames": "geonameID"}

    def __init__(self,es_client):
        """Constructor for Geodict"""
        Base.__init__(self,es_client)

    def _search(self, query):
        """Run ``query`` and return the converted result."""
        return self.convert_es_to_pandas(self.es_client.search(self._INDEX, self._DOC_TYPE, query))

    def get_by_label(self, label, lang, score=True, size=1):
        """Term query on the field named after ``lang``; at most ``size`` results, sorted by score when ``score`` is set."""
        query = self.qb.query(term=True, field=lang, value=label, sorted=score,
                              sorted_by=self.score_field, sized=True, size=size)
        return self._search(query)

    def get_by_alias(self, alias, lang, score=True, size=1):
        """Nested term query on ``aliases`` with ``lang`` as nested field; at most ``size`` results."""
        query = self.qb.query(term=True, nested=True, nested_field=lang, field="aliases",
                              value=alias, sorted=score, sorted_by=self.score_field,
                              sized=True, size=size)
        return self._search(query)

    def get_n_label_similar(self, label, lang, n, score=True):
        """Query-string search on the ``lang`` field using the regular expression built by ``parse_label2``; at most ``n`` results."""
        query = self.qb.query(query_string=True, regexp=True, regexp_value=parse_label2(label, lang),
                              field=lang, value=label, sorted=score,
                              sorted_by=self.score_field, sized=True, size=n)
        return self._search(query)

    def get_n_alias_similar(self, alias, lang, n, score=True):
        """Nested query-string search on ``aliases`` using the regular expression built by ``parse_label2``; at most ``n`` results."""
        query = self.qb.query(query_string=True, nested=True, nested_field=lang, regexp=True,
                              regexp_value=parse_label2(alias, lang), field="aliases",
                              value=alias, sorted=score, sorted_by=self.score_field,
                              sized=True, size=n)
        return self._search(query)

    def get_in_radius(self, lon, lat, unit="km",distance=10, score=True, size=1):
        """Match-all query filtered on the ``coord`` field within ``distance`` ``unit`` of (``lon``, ``lat``)."""
        query = self.qb.query(match_all=True, in_radius=True, radius_size=distance,
                              radius_unit=unit, radius_centroid=(lon, lat), sorted=score,
                              sorted_by=self.score_field, sized=True, geo_field="coord",
                              size=size)
        return self._search(query)

    def get_by_id(self, id):
        """Term query on the ``id`` field, limited to one result."""
        query = self.qb.query(term=True, field="id", value=id, sized=True, size=1)
        return self._search(query)

    def get_by_other_id(self,id,identifier="wikidata"):
        """
        Term query on an external identifier, limited to one result.

        Parameters
        ----------
        id : object
            identifier value to look for
        identifier : str
            ``'wikidata'`` (field ``wikidataID``) or ``'geonames'`` (field ``geonameID``)

        Raises
        ------
        ValueError
            when ``identifier`` is not one of the supported kinds
        """
        if identifier not in self._OTHER_ID_FIELDS:
            raise ValueError("Identifier type must be taken from the following items : 'wikidata' or 'geonames'")
        query = self.qb.query(term=True, field=self._OTHER_ID_FIELDS[identifier], value=id,
                              sized=True, size=1)
        return self._search(query)
\ No newline at end of file
# coding = utf-8
from .base import Base,parse_label,parse_label2
class Geonames(Base):
    """
    Getter for the Geonames gazetteer.

    Every query is sent through ``es_client.search`` with ``"geonames"`` and
    ``"geoname"`` as the first two positional arguments, and the result is
    converted with ``convert_es_to_pandas``.
    """

    def __init__(self,es_client,score="dem"):
        """Constructor for Geonames; ``score`` names the field used as score."""
        super().__init__(es_client, field_score=score)

    def _fetch(self, body):
        """Send ``body`` to the client and convert the answer."""
        raw = self.es_client.search("geonames", "geoname", body)
        return self.convert_es_to_pandas(raw)

    def get_by_label(self, label, lang, score=True, size=1):
        """Term query on ``name``; ``lang`` is accepted but not used."""
        body = self.qb.query(
            term=True,
            field="name",
            value=label,
            sorted=score,
            sorted_by=self.score_field,
            sized=True,
            size=size,
        )
        return self._fetch(body)

    def get_by_alias(self, alias, lang, score=True, size=1):
        """Term query on ``alternativenames``; ``lang`` is accepted but not used."""
        body = self.qb.query(
            term=True,
            field="alternativenames",
            value=alias,
            sorted=score,
            sorted_by=self.score_field,
            sized=True,
            size=size,
        )
        return self._fetch(body)

    def get_n_label_similar(self, label, lang, n, score=True):
        """Query-string search on ``name`` with the pattern from ``parse_label2``; at most ``n`` results."""
        pattern = parse_label2(label, lang)
        body = self.qb.query(
            query_string=True,
            regexp=True,
            regexp_value=pattern,
            field="name",
            value=label,
            sorted=score,
            sorted_by=self.score_field,
            sized=True,
            size=n,
        )
        return self._fetch(body)

    def get_n_alias_similar(self, alias, lang, n, score=True):
        """Query-string search on ``alternativenames`` with the pattern from ``parse_label2``; at most ``n`` results."""
        pattern = parse_label2(alias, lang)
        body = self.qb.query(
            query_string=True,
            regexp=True,
            regexp_value=pattern,
            field="alternativenames",
            value=alias,
            sorted=score,
            sorted_by=self.score_field,
            sized=True,
            size=n,
        )
        return self._fetch(body)

    def get_in_radius(self, lon, lat, unit="km",distance=10, score=True, size=1):
        """Match-all query filtered on ``coordinates`` within ``distance`` ``unit`` of (``lon``, ``lat``)."""
        body = self.qb.query(
            match_all=True,
            in_radius=True,
            radius_size=distance,
            radius_unit=unit,
            radius_centroid=(lon, lat),
            sorted=score,
            sorted_by=self.score_field,
            sized=True,
            geo_field="coordinates",
            size=size,
        )
        return self._fetch(body)

    def get_by_id(self, id):
        """Term query on ``geonameid``, limited to one result."""
        body = self.qb.query(term=True, field="geonameid", value=id, sized=True, size=1)
        return self._fetch(body)
# coding = utf-8
import numpy as np
\ No newline at end of file
#!/usr/bin/env python
"""Scrap module.
Just tiny bits & bolts.
.. author: Adrian Castravete
.. modified by : Jacques Fize (Implemented for Python 3 and recursive objectification)
"""
from functools import wraps
def objectify(func):
    """Mimic an object given a dictionary.

    Given a dictionary, create an object and make sure that each of its
    keys are accessible via attributes. Nested dictionaries are converted
    recursively, so both ``obj['a']['b']`` and ``obj.a.b`` work.

    If func is a function act as decorator, otherwise just change the dictionary
    and return it.

    :param func: A function or another kind of object.
    :returns: Either the wrapper for the decorator, or the changed value.

    Example::

        >>> obj = {'old_key': 'old_value'}
        >>> oobj = objectify(obj)
        >>> oobj['new_key'] = 'new_value'
        >>> print(oobj['old_key'], oobj['new_key'], oobj.old_key, oobj.new_key)
        old_value new_value old_value new_value

        >>> @objectify
        ... def func():
        ...     return {'old_key': 'old_value'}
        >>> obj = func()
        >>> obj['new_key'] = 'new_value'
        >>> print(obj['old_key'], obj['new_key'], obj.old_key, obj.new_key)
        old_value new_value old_value new_value
    """

    class Object(dict):
        """Dictionary whose items are mirrored as instance attributes."""

        def __setitem__(self, key, val):
            # Only strings can be attribute names; other keys stay item-only.
            if isinstance(key, str):
                setattr(self, key, val)
            return super(Object, self).__setitem__(key, val)

    def create_object(value):
        """Create the object.

        Given a dictionary, create an object and make sure that each of its
        keys are accessible via attributes.
        Ignore everything if the given value is not a dictionary.

        :param value: A dictionary or another kind of object.
        :returns: Either the created object or the given value.
        """
        if not isinstance(value, dict):
            return value

        ret_obj = Object()
        # dict.items(value) instead of value.items(): an already objectified
        # input may have an "items" key shadowing the method.
        for key, val in dict.items(value):
            # __setitem__ stores the item and the attribute together, so both
            # refer to the same (possibly converted) value.
            ret_obj[key] = create_object(val)
        return ret_obj

    # If func is a function, wrap around and act like a decorator.
    if callable(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            """Wrapper function for the decorator.

            :returns: The return value of the decorated function.
            """
            return create_object(func(*args, **kwargs))

        return wrapper

    # Else just try to objectify the value given.
    return create_object(func)
# coding = utf-8
class Parser(object):
    """
    Base parser; its hook currently does nothing.
    """

    def __init__(self):
        """Constructor for Parser"""

    def parse_query_output(self):
        """Placeholder hook; returns ``None``."""
        return None
class GeodictParser(Parser):
    """
    Parser variant for Geodict; its hook currently does nothing.
    """

    def __init__(self):
        """Constructor for GeodictParser"""
        super().__init__()

    def parse_query_output(self):
        """Placeholder hook; returns ``None``."""
        return None
class GeonamesParser(Parser):
    """
    Parser variant for Geonames; its hook currently does nothing.
    """

    def __init__(self):
        """Constructor for GeonamesParser"""
        super().__init__()

    def parse_query_output(self):
        """Placeholder hook; returns ``None``."""
        return None
# coding = utf-8
import warnings
def is_number(x):
    """Return True when ``x`` is an ``int`` or a ``float``."""
    return isinstance(x, (float, int))


class QueryBuilder():
    """
    Builds the body (a ``dict``) of a search query from keyword flags.
    """

    def __init__(self):
        """Constructor for QueryBuilder"""
        pass

    def query(self,**kwargs):
        """
        Build a query body.

        Parameters
        ----------
        term, query_string, match_all : bool
            main query type; ``match_all`` excludes the others, and ``term``
            and ``query_string`` exclude each other
        nested : bool
            wrap the query in a nested query on ``field``; the main query then
            targets ``field.nested_field``
        value : object
            value searched by the term / query_string query
        field : str
            field targeted by the main query (may be empty for ``match_all``)
        nested_field : str
            sub-field name, required when ``nested`` is set
        regexp : bool
            use ``regexp_value`` instead of ``value`` in a query_string query
        regexp_value : str
            regular expression used when ``regexp`` is set
        sorted : bool
            sort the results on ``sorted_by``, descending
        sorted_by : str
            field used for sorting
        sized : bool
            limit the number of results to ``size``
        size : int
            maximum number of results (default 1)
        min_valued, max_valued : bool
            add a lower (``gt``) / upper (``lt``) bound on ``range_field``
        min_value, max_value : number
            the bounds (defaults 0 and 2000000)
        range_field : str
            field the bounds apply to; falls back to ``sorted_by`` when omitted
        in_radius : bool
            filter on a distance around ``radius_centroid``
        radius_size, radius_unit : number, str
            distance and its unit (defaults 1 and "km")
        radius_centroid : tuple
            ``(lon, lat)`` of the centre (default ``(0, 0)``)
        geo_field : str
            field holding the coordinates (default "coord")

        Returns
        -------
        dict
            query body

        Raises
        ------
        Exception
            when the arguments are contradictory or incomplete
        """
        #Query Type
        term = kwargs.get("term", False)
        query_string = kwargs.get("query_string", False)
        nested = kwargs.get("nested", False)
        match_all = kwargs.get("match_all", False)

        #Value
        value = kwargs.get("value", "")

        # Additional Filter
        is_sorted = kwargs.get("sorted", False)
        regexped = kwargs.get("regexp", False)
        sized = kwargs.get("sized", False)
        min_valued = kwargs.get("min_valued", False)
        max_valued = kwargs.get("max_valued", False)
        in_radius = kwargs.get("in_radius", False)

        # Additional Filter Value
        field = kwargs.get("field", "")
        nested_field = kwargs.get("nested_field", "")
        sorted_by = kwargs.get("sorted_by", "")
        size = kwargs.get("size", 1)
        min_value = kwargs.get("min_value", 0)
        max_value = kwargs.get("max_value", 2000000)
        radius_size = kwargs.get("radius_size", 1)
        radius_unit = kwargs.get("radius_unit", "km")
        radius_centroid = kwargs.get("radius_centroid", (0, 0))
        regexp_value = kwargs.get("regexp_value", "")
        geo_field = kwargs.get("geo_field", "coord")
        # A range clause must name the field it constrains; use the sort field
        # when the caller gives none.
        range_field = kwargs.get("range_field", "") or sorted_by

        if match_all and (query_string or term or nested):
            raise Exception("Match all can't be combine with other queries!")
        if term and query_string:
            raise Exception("Impossible to have term and query_string at the same time")
        if nested and (not isinstance(nested_field,str) or not nested_field):
            raise Exception("You forgot to indicate the nested_field name !")
        if regexped and term:
            raise Exception("You can't use regexp with term query")
        if (not field and not match_all) or not isinstance(field,str) :
            raise Exception("Missing field name or bad type!")
        if not self.check_consistency(is_sorted,regexped,sized,min_valued,max_valued,in_radius,
                                      sorted_by,size,min_value,max_value,radius_size,radius_unit,regexp_value,geo_field):
            raise Exception("Args of query() are not consistent! Double Check!")
        if (min_valued or max_valued) and (not isinstance(range_field,str) or not range_field):
            raise Exception("min_valued/max_valued need a range_field (or sorted_by) to apply the bound to!")

        # Main clause
        if query_string:
            main_clause = {"query_string": {"default_field": field,
                                            "query": regexp_value if regexped else value}}
        elif term:
            main_clause = {"term": {field: value}}
        elif match_all:
            main_clause = {"match_all": {}}
        else:
            main_clause = {}

        must = [main_clause]
        bool_query = {"must": must}

        if min_valued:
            must.append({"range": {range_field: {"gt": min_value}}})
        if max_valued:
            must.append({"range": {range_field: {"lt": max_value}}})

        if in_radius:
            bool_query["filter"] = {"geo_distance": {
                "distance": "{0}{1}".format(radius_size, radius_unit),
                geo_field: {"lon": radius_centroid[0], "lat": radius_centroid[1]}}}

        query_body = {"bool": bool_query}

        if nested:
            # Inside a nested query the main clause targets "<field>.<nested_field>".
            nested_path = ".".join([field, nested_field])
            if query_string:
                main_clause["query_string"]["default_field"] = nested_path
            elif term:
                must[0] = {"term": {nested_path: value}}
            query_body = {"nested": {"path": field, "query": query_body}}

        body_query = {"query": query_body}
        if is_sorted:
            body_query["sort"] = [{sorted_by: "desc"}]
        if sized:
            body_query["size"] = size
        return body_query

    def check_consistency(self,sorted,regexped,sized,min_valued,max_valued,in_radius,
                          sorted_by,size,min_value,max_value,radius_size,radius_unit,regexp_value,geo_field):
        """
        Check that every enabled option comes with a usable value.

        A warning naming the offending argument is emitted for the first
        problem found.

        Returns
        -------
        bool
            True when the arguments are consistent, False otherwise
        """
        if sorted and (not isinstance(sorted_by,str) or not sorted_by):
            warnings.warn("sorted_by must not be empty and should be a str")
            return False
        if sized and (not is_number(size) or not size):
            warnings.warn("size must be a non-zero number")
            return False
        # 0 is a legitimate bound (it is even the default min_value), so only the type is checked.
        if min_valued and not is_number(min_value):
            warnings.warn("min_value must be a number")
            return False
        if max_valued and not is_number(max_value):
            warnings.warn("max_value must be a number")
            return False
        if in_radius and (not isinstance(geo_field,str) or not is_number(radius_size) or not isinstance(radius_unit,str) or not radius_unit or not radius_size or not geo_field):
            warnings.warn("geo_field and radius_unit must be non-empty str and radius_size a non-zero number")
            return False
        if regexped and (not isinstance(regexp_value,str) or not regexp_value):
            warnings.warn("regexp_value must not be empty and should be a str")
            return False
        return True
def test_consistency():
    """Placeholder; does nothing and returns ``None``."""
    return None
\ No newline at end of file
absolute humidity
absolute location
accessibility resource
accessibility
acid rain
active volcano
agricultural geography
air mass
alluvia
alluvial soils
altitude
antarctic
antarctic circle
anthracite
anthropization
anticline
antimeridian
antipodes
aquifer
archipelago
arête
arroyo
arctic
arctic circle
ash
atlantic seaboard fall line
atlas
atmosphere
atoll
ayre
azimuth
badlands
barrier ridge
base level
basin
batholith
bay
beach
bearing
bedrock
bight
biological diversity
biogeography
biosphere
biota
bituminous
blowout
bocage
body of water
bootheel
border
break-in-bulk point
built environment
butte
calanque
caldera
canal
canyon
cape
caprock
cardinal directions
carrying capacity
cartography
cartogram
cave
cay
cenote
central business district
census-designated place (cdp)
channel
chaparral
chimney
chinook
chorography
cinder cone
circle of latitude
cirque
city
city-state
cliff
climax vegetation
coast
col
colony
commonwealth
compass
compass rose
confluence
coniferous
contiguity
continent
continental climate
continental divide
continental shelf
continentality
contour lines
conurbation
corrasion
core area