Commit 4e72b97e authored by Decoupes Remy
Browse files

Add time normalization to the NLP pipeline using [timexy](https://github.com/paulrinckens/timexy)

parent ca6f1856
No related merge requests found
Showing with 93 additions and 8 deletions
+93 -8
......@@ -18,6 +18,7 @@ import requests
import rubrix as rb
from transformers import pipeline
from tqdm import tqdm
from timexy import Timexy #https://github.com/paulrinckens/timexy
if __name__ == '__main__':
# Corpus size: nb of tweets to process
......@@ -33,8 +34,16 @@ if __name__ == '__main__':
translator_fr = pipeline("translation", model=model_checkpoint_fr)
model_checkpoint_it = "Helsinki-NLP/opus-mt-it-en"
translator_it = pipeline("translation", model=model_checkpoint_it)
# Configure timexy type
config = {
"kb_id_type": "timex3", # possible values: 'timex3'(default), 'timestamp'
"label": "timexy", # default: 'timexy'
"overwrite": False # default: False
}
nlp_en.add_pipe("timexy", config=config, before="ner")
def is_SNE(x):
    """Return True when the entity's ``label_`` is "LOC", "GPE" or "FAC"."""
    return x.label_ in ("LOC", "GPE", "FAC")
def is_TNE(x):
    """Return True when the entity's ``label_`` is "DATE"."""
    return x.label_ == "DATE"
# Configure geocoding
url_photon = "https://photon.komoot.io/api/?q="
"""
......@@ -56,30 +65,36 @@ if __name__ == '__main__':
ner = nlp_en(text_en)
list_of_normalize_SNE = []
list_of_normalize_TNE = []
SNE_found = False
TNE_found = False
if rubrix == True:
rubrix_tweet_records = []
for entity in ner.ents:
if is_SNE(entity):
SNE_found = True
reponses = requests.get(url_photon + str(entity))
try:
list_of_normalize_SNE.append(reponses.json()['features'][0])
except:#when the NER pipeline had wrong label a SNE
pass
# reponses = requests.get(url_photon + str(entity))
# try:
# list_of_normalize_SNE.append(reponses.json()['features'][0])
# except:#when the NER pipeline had wrong label a SNE
# pass
if rubrix == True:
rubrix_tweet_records.append((entity.label_, entity.start_char, entity.end_char))
if rubrix == True and SNE_found == True: #log in rubrix only if we found a SNE
if is_TNE(entity):
TNE_found = True
if rubrix == True:
rubrix_tweet_records.append((entity.label_, entity.start_char, entity.end_char))
if rubrix == True and (SNE_found == True or TNE_found == True): #log in rubrix only if we found a SNE or TNE
tokens = [token.text for token in ner]
rubrix_records.append(
rb.TokenClassificationRecord(
text=text_en + "\n translate from" + tweet["text"],
tokens=tokens,
prediction=rubrix_tweet_records,
prediction_agent="geocode_tweet_content-translation"
prediction_agent="geocode_tweet_content_translation_TNE"
)
)
if rubrix == True:
rb.log(records=rubrix_records, name="geocode_tweet_content-translation")
rb.log(records=rubrix_records, name="geocode_tweet_content_translation_TNE")
......@@ -68,3 +68,73 @@ wasabi==0.9.0
wrapt==1.13.3
xx-ent-wiki-sm @ https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.2.0/xx_ent_wiki_sm-3.2.0-py3-none-any.whl
zipp==3.7.0
anyio==3.5.0
blis==0.7.6
catalogue==2.0.6
certifi==2021.10.8
charset-normalizer==2.0.12
click==8.0.4
cycler==0.11.0
cymem==2.0.6
eland==8.0.0
elastic-transport==8.1.0
elasticsearch==8.1.0
en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.2.0/en_core_web_trf-3.2.0-py3-none-any.whl
filelock==3.6.0
fonttools==4.31.1
fr-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.2.0/fr_core_news_lg-3.2.0-py3-none-any.whl
h11==0.9.0
httpcore==0.11.1
httpx==0.15.5
huggingface-hub==0.4.0
idna==3.3
importlib-metadata==4.11.3
it-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.2.0/it_core_news_lg-3.2.0-py3-none-any.whl
Jinja2==3.0.3
joblib==1.1.0
kiwisolver==1.4.0
langcodes==3.3.0
loguru==0.6.0
MarkupSafe==2.1.0
matplotlib==3.5.1
murmurhash==1.0.6
numpy==1.21.5
packaging==21.3
pandas==1.3.5
pathy==0.6.1
Pillow==9.0.1
preshed==3.0.6
pydantic==1.8.2
pyparsing==3.0.7
python-dateutil==2.8.2
pytz==2021.3
PyYAML==6.0
regex==2022.3.2
requests==2.27.1
rfc3986==1.5.0
rubrix==0.12.1
sacremoses==0.0.47
sentencepiece==0.1.96
six==1.16.0
smart-open==5.2.1
sniffio==1.2.0
spacy==3.2.3
spacy-alignments==0.8.4
spacy-legacy==3.0.9
spacy-loggers==1.0.1
spacy-transformers==1.1.4
srsly==2.4.2
starlette==0.19.0
thinc==8.0.13
timexy==0.1.3
tokenizers==0.10.3
torch==1.11.0
tqdm==4.63.0
transformers==4.15.0
typer==0.4.0
typing-extensions==3.10.0.2
urllib3==1.26.8
wasabi==0.9.0
wrapt==1.13.3
xx-ent-wiki-sm @ https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.2.0/xx_ent_wiki_sm-3.2.0-py3-none-any.whl
zipp==3.7.0
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment