# property_extract.py 1.45 KiB
# coding=utf-8
import json
from gzip import GzipFile
import sys
from collections import defaultdict
import time
from wikidata.helpers import *
from wikidata.types_wd import *
from wikidata.entity_wd import *
from wikidata.property_wd import *
from wikidata.reader import Reader
from wikidata.process_wd import *



class PropertyExtract(Process):
    """Process that copies selected property values of entries into a JSON table.

    The table is loaded once from a JSON file at construction time and kept
    in ``self.dataframe``.  ``processItem`` then fills it in, one entry at a
    time, for every entry whose ``'id'`` is already present in the table.
    """

    def __init__(self, id, properties, data):
        """Initialise the process and load the table to be enriched.

        Args:
            id: Identifier forwarded unchanged to ``Process.__init__``.
            properties: Mapping with a ``'properties_to_extract'`` key.  Each
                item under that key is read by ``processItem`` through its
                ``'id'``, ``'isMultiple'`` and ``'type'`` keys; ``'type'``
                must be one of the keys of ``self.isType``.
            data: Path of a UTF-8 encoded JSON file.  Its parsed content is
                looked up by entry id and item-assigned by property id, so
                it is presumably an object of objects -- verify against the
                producer of that file.

        Raises:
            KeyError: If ``properties`` has no ``'properties_to_extract'`` key.
            OSError: If ``data`` cannot be opened.
            ValueError: If ``data`` does not contain valid JSON.
        """
        # Previously written as ``super(PropertyExtract, Process.__init__(...))``:
        # the base initialiser only ran as a side effect of evaluating the
        # argument and its ``None`` result was handed to ``super``.
        super(PropertyExtract, self).__init__(id)
        self.properties_to_extract = properties['properties_to_extract']
        # Maps the ``'type'`` name used in the configuration to the type
        # object handed to ``Property``.
        self.isType = {
            "EntityID": EntityID(),
            "String": String(),
            "Coordinates": Coordinates(),
            "URL": URL(),
            "Time": Time(),
            "Quantity": Quantity(),
            "ExternalIdentifier": ExternalIdentifier(),
        }
        print("Extracting property...")

        # The context manager closes the file even when json.load raises.
        with open(data, encoding='utf-8') as f:
            self.dataframe = json.load(f)

    def processItem(self, entry):
        """Store the configured properties of ``entry`` in ``self.dataframe``.

        Entries whose ``'id'`` is not already in ``self.dataframe`` are
        ignored.  For every other entry, each configured property that
        ``Property.exists`` reports as present is extracted and written to
        ``self.dataframe[entry['id']][prop['id']]``.

        Extraction is best effort: if extracting or storing one property
        raises, the entry id is printed and the remaining properties are
        still processed.

        Args:
            entry: Entry to process; must provide an ``'id'`` key.
        """
        entry_id = entry['id']
        if entry_id not in self.dataframe:
            return

        for prop in self.properties_to_extract:
            temp_prop = Property(
                prop['id'], prop["isMultiple"], self.isType[prop['type']]
            )
            if not temp_prop.exists(entry):
                continue
            try:
                self.dataframe[entry_id][prop['id']] = temp_prop.extractData(entry)
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt / SystemExit must
                # still be able to stop a long-running extraction.
                print(entry_id)