Commit 79ec0dec authored by Decoupes Remy's avatar Decoupes Remy
Browse files

put GeoVirus into rubrix

parent 068ad50d
No related merge requests found
Showing with 38 additions and 0 deletions
+38 -0
...@@ -2,4 +2,6 @@ ...@@ -2,4 +2,6 @@
*.swp *.swp
# old scripts # old scripts
first_dataset.py first_dataset.py
# data
GeoVirus.json
import json
from tqdm.auto import tqdm
import rubrix as rb
import spacy
# Script: load the GeoVirus corpus from GeoVirus.json, tokenize every article
# with spaCy and log the annotated locations to Rubrix as token-classification
# records under the dataset name "geovirus".
nlp = spacy.load("en_core_web_sm")  # for tokenization

# Decode explicitly as UTF-8: the entity spans below are character offsets, so
# a platform-dependent default encoding could silently shift them.
with open('GeoVirus.json', 'r', encoding='utf-8') as infile:
    data = json.load(infile)

records = []
for news_article in tqdm(data["articles"]["article"]):  # loop over the corpus' documents
    text = news_article["text"]

    # Only the tokens are used, so run the tokenizer alone instead of the
    # whole pipeline (tagger, parser, NER, ...).
    doc = nlp.make_doc(text)
    tokens = [token.text for token in doc]

    # NOTE(review): the nested "locations"/"location" layout looks like an
    # XML-to-JSON conversion; such converters may emit null for an empty
    # element and a single dict (not a list) for a lone child - confirm
    # against the data. Normalize both cases to a list defensively.
    locations = (news_article.get("locations") or {}).get("location") or []
    if isinstance(locations, dict):
        locations = [locations]

    # GeoVirus character positions (start and end of each annotated span)
    # appear to be 1-based rather than 0-based, so subtract 1 from each.
    entities = [
        ("loc", int(ner["start"]) - 1, int(ner["end"]) - 1)
        for ner in locations  # loop over annotated entities
    ]

    records.append(
        rb.TokenClassificationRecord(
            text=text,
            tokens=tokens,
            prediction=entities,
            prediction_agent="geovirus",
        )
    )

rb.log(records=records, name="geovirus")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment