put GeoVirus into rubrix

79ec0dec · Decoupes Remy · 068ad50d · 79ec0dec · 79ec0dec
Commit 79ec0dec authored 3 years ago by Decoupes Remy
Hide whitespace changes
Inline Side-by-side

Showing

with 38 additions and 0 deletions
+38 -0
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,6 @@
 *.swp
 # old scripts
 first_dataset.py
+# data
+GeoVirus.json
--- a/geovirus2rubrix.py
+++ b/geovirus2rubrix.py
+import json
+from tqdm.auto import tqdm
+import rubrix as rb
+import spacy
+nlp = spacy.load("en_core_web_sm") # spaCy pipeline, used here only to tokenize the article text
+with open('GeoVirus.json', 'r') as infile:
+	data = json.load(infile)
+records = [] # one TokenClassificationRecord per article, all logged in a single call at the end
+for news_article in tqdm(list(data["articles"]["article"])): # loop over the corpus documents, with a progress bar
+	doc = nlp(news_article["text"])
+	entities = []
+	for ner in news_article["locations"]["location"]: # loop over annotated locations; NOTE(review): assumes a list of dicts - confirm for single-location articles
+        # GeoVirus character offsets (start and end of each annotated span) appear to be
+        # 1-based rather than 0-based, so we subtract 1 from each position.
+        # NOTE(review): confirm that the shifted end offset matches the span convention Rubrix expects.
+		entities.append(
+			("loc", int(ner["start"]) -1 , int(ner["end"]) -1) # (label, start, end); every span gets the label "loc"
+			)
+	tokens = [token.text for token in doc] # token strings taken from the spaCy doc
+	records.append(
+		rb.TokenClassificationRecord(
+			text=news_article["text"],
+			tokens=tokens,
+			prediction=entities, # GeoVirus spans are passed as predictions, not annotations
+			prediction_agent="geovirus"
+		)
+	)
+rb.log(records=records, name="geovirus") # log all records to Rubrix under the name "geovirus"