# generate_data_csv.py
# coding: utf-8
import argparse
import glob
import json
import os
import re
import string
import time

from progressbar import ProgressBar, Timer, Bar, ETA, Counter

from strpython.models.str import STR
# Wildcard imports: get_data(), used below, is expected to come from one of these modules
from strpython.nlp.disambiguator.share_prop import *
from strpython.pipeline import *
import pandas as pd
import networkx as nx

def filter_nonprintable(text):
    """Remove non-printable ASCII characters from `text` (non-ASCII characters are kept)."""
    # ASCII characters that are not in string.printable, i.e. control characters
    nonprintable = set([chr(i) for i in range(128)]).difference(string.printable)
    # A translation table mapping a codepoint to None deletes that character
    return text.translate({ord(character): None for character in nonprintable})
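# Example (illustrative): control characters are stripped, accented characters kept
#   filter_nonprintable("Montpellier\x07 étang")  ->  "Montpellier étang"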

parser = argparse.ArgumentParser()
parser.add_argument("csv_input_dir")
parser.add_argument("graphs_output_dir")
parser.add_argument("metadata_output_fn")

subparsers = parser.add_subparsers(help='commands')

normal = subparsers.add_parser(
    'normal', help='Basic STR generation. No arguments are necessary!')
normal.set_defaults(which="norm")


gen_parser = subparsers.add_parser(
    'generalisation', help='Apply a generalisation transformation on the generated STRs')
gen_parser.set_defaults(which="gene")
gen_parser.add_argument(
    '-t','--type_gen', help='Type of generalisation',default="all")
gen_parser.add_argument(
    '-n', help='Generalisation level', default=1)
gen_parser.add_argument(
    '-b', '--bound', help='If the generalisation is bounded, this argument corresponds'
                          ' to the maximal spatial level (e.g. "country")', default="country")

ext_parser = subparsers.add_parser(
    'extension', help='Apply an extension process on the STRs')
ext_parser.set_defaults(which="ext")
ext_parser.add_argument(
    '-d','--distance', help='radius distance',default=150)
ext_parser.add_argument(
    '-u','--unit', help='unit used for the radius distance',default="km")
ext_parser.add_argument(
    '-a', '--adjacent_count', help='number of adjacent spatial entities added to the STR', default=1)
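
# Example invocations (illustrative paths):
#   python generate_data_csv.py data/csv data/graphs metadata.json normal
#   python generate_data_csv.py data/csv data/graphs metadata.json generalisation -t bounded -b country
#   python generate_data_csv.py data/csv data/graphs metadata.json extension -d 200 -u km -a 2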

args = parser.parse_args()
if "which" in args:
    if args.which =="gene":
        args.type_trans="gen"
    elif args.which =="ext":
        args.type_trans="ext"
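
# For instance, the `generalisation -t bounded` invocation above would yield roughly:
#   Namespace(csv_input_dir='data/csv', graphs_output_dir='data/graphs',
#             metadata_output_fn='metadata.json', which='gene', type_gen='bounded',
#             n=1, bound='country', type_trans='gen')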

print("Parameters entered : ",args)

if not os.path.exists(args.csv_input_dir):
    print("Input directory not found:", args.csv_input_dir)
    exit(1)
files_glob = glob.glob(args.csv_input_dir + "/*.csv")
if not files_glob:  # fall back to plain-text input if no CSV files are present
    files_glob = glob.glob(args.csv_input_dir + "/*.txt")
if not files_glob:
    print("No .csv or .txt input files found in", args.csv_input_dir)
    exit(1)

if not os.path.exists(args.graphs_output_dir):
    os.makedirs(args.graphs_output_dir)
start = time.time()

associated_es={}
count_per_doc={}
i=0
#logging.info("Get associated spatial entities and ")
with ProgressBar(max_value=len(files_glob),widgets=[' [', Timer(), '] ',Bar(),'(', Counter(),')','(', ETA(), ')']) as pg:
    for fn in files_glob:

        id_ = int(re.findall(r"\d+", fn)[-1])  # document id = last number in the filename
        df = pd.read_csv(fn)
        # Drop rows whose GID marks an unresolved spatial entity ('~' negates the mask)
        df = df[~df["GID"].isin(['0', 'o', 'NR', 'O'])]
        try:
            count_per_doc[id_] = json.loads(df.groupby("GID").GID.count().to_json())
            associated_es[id_] = df[["GID", "text"]].groupby("GID", as_index=False).max().set_index('GID').to_dict()["text"]
        except Exception:  # e.g. empty dataframe after filtering
            count_per_doc[id_] = {}
            associated_es[id_] = {}
        i += 1
        pg.update(i)
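# Sketch of the aggregation above on illustrative data (GIDs are hypothetical):
#   df = pd.DataFrame({"GID": ["GD1", "GD1", "GD2"], "text": ["Paris", "paris", "Lyon"]})
#   json.loads(df.groupby("GID").GID.count().to_json())   ->  {"GD1": 2, "GD2": 1}
#   df[["GID", "text"]].groupby("GID", as_index=False).max() \
#       .set_index("GID").to_dict()["text"]               ->  {"GD1": "paris", "GD2": "Lyon"}
# i.e. a per-document entity frequency map, plus one surface form per entity.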
#logging.info("Fetch list of spatial entities available !")
all_es=set([])
for k,v in associated_es.items():
    for k2 in v:
        all_es.add(k2)



i = 0
def get_label_en(gid):
    """Return the English label of a spatial entity, or None if the lookup fails."""
    try:
        # get_data() is expected to come from the strpython wildcard imports above
        return get_data(gid)["en"]
    except Exception:
        print("No English label found for", gid)
        return None
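# get_data() is assumed to return a dict of labels keyed by language code, e.g.
#   get_data("GD1")  ->  {"en": "Paris", "fr": "Paris", ...}   (illustrative)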
with ProgressBar(max_value=len(files_glob),
                 widgets=[' [', Timer(), '] ', Bar(), '(', Counter(), ')', '(', ETA(), ')']) as pg:
    for fn in files_glob:

        id_ = int(re.findall(r"\d+", fn)[-1])
        df = pd.read_csv(fn)
        df = df.fillna("O")  # missing GIDs become "O" so the filter below drops them
        df = df[~df["GID"].isin(['0', 'o', 'NR', 'O'])]
        df["label"] = df.GID.apply(get_label_en)
        df = df.rename(columns={"GID": "id"})
        str_ = STR.from_pandas(df, []).build()
        nx.write_gexf(str_, os.path.join(args.graphs_output_dir, "{0}.gexf".format(id_)))
        i += 1
        pg.update(i)
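# Quick sanity check (illustrative; "42" is a hypothetical document id):
#   g = nx.read_gexf(os.path.join(args.graphs_output_dir, "42.gexf"))
#   print(g.number_of_nodes(), "nodes,", g.number_of_edges(), "edges")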


# Save metadata alongside the generated graphs
with open(os.path.join(args.graphs_output_dir, args.metadata_output_fn), 'w') as f:
    f.write(json.dumps([associated_es, count_per_doc], indent=4))
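# Resulting metadata layout (illustrative GIDs and document ids):
# [
#   {"1234": {"GD1": "Paris"}},   <- associated_es: doc id -> {GID: surface form}
#   {"1234": {"GD1": 2}}          <- count_per_doc: doc id -> {GID: occurrence count}
# ]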


print("--- %s seconds ---" % (time.time() - start))