Commit bbb0bfe7 authored by Interdonato Roberto

Upload New File

parent 21df6bc6
import itertools
import pandas as pd
import string
import datetime
def get_distance(w1, w2, tokens=None):
    """Return the mean absolute index distance between two words.

    Every occurrence of ``w1`` is paired with every occurrence of ``w2`` and
    the absolute differences of their positions are averaged.

    Args:
        w1: First word to locate.
        w2: Second word to locate.
        tokens: Sequence of words to search. When omitted, the module-level
            ``words`` list is used, which keeps two-argument calls working.

    Returns:
        The average distance as a float, or ``None`` when either word does
        not occur in the searched sequence.
    """
    if tokens is None:
        # Backward-compatible fallback: read the list the script stores
        # in the global ``words`` before calling this function.
        tokens = words
    w1_indexes = [index for index, value in enumerate(tokens) if value == w1]
    w2_indexes = [index for index, value in enumerate(tokens) if value == w2]
    if not w1_indexes or not w2_indexes:
        return None
    distances = [abs(i - j) for i, j in itertools.product(w1_indexes, w2_indexes)]
    return sum(distances) / float(len(distances))
# Keyword vocabularies. A title word found in either list is kept for the
# distance computation; the list containing the first kept word decides the
# "first appearing group" column (1 or 2).
# NOTE(review): titles are tokenised into single words below, so multi-word
# entries such as 'future studies' or 'strategic planning' can never match.
group1 = ['prospective', 'anticipation', 'foresights', 'forecast', 'future studies', 'prospective thinking', 'strategic planning', 'prospeccion', 'prospectiva']
# NOTE(review): 'assessement' looks like a misspelling of 'assessment' - confirm.
group2 = ['modélisation', 'modelling','modeling','scenario', 'planning', 'mapping', 'simulation', 'quantitative','assessement', 'qualitative', 'narrative','scenarios','modelado']
#input_f = "test_jeremy1.csv"
input_f = "test_jeremy1_abs-title-key.csv"
# Inclusive range of publication years to keep.
start_date = 2010
end_date = 2020
df = pd.read_csv(input_f, index_col=0, sep=';')

# Build the punctuation-stripping table once instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)

# The context manager guarantees the output file is closed even if a row
# raises part-way through the loop.
#with open("distance_words_abs-title-key_%d_%d.csv" % (start_date, end_date), 'w') as fout:
with open("distance_words_%d_%d_tmp.csv" % (start_date, end_date), 'w') as fout:
    fout.write("first appearing group;avg distance\n")
    for _, row in df.iterrows():
        cd = row["coverDate"]
        try:
            year = datetime.datetime.strptime(cd, '%Y-%m-%d').year
        except (TypeError, ValueError):
            # Missing or malformed date: report it and skip the row rather
            # than falling through with an undefined or stale date.
            print(cd)
            continue
        if not start_date <= year <= end_date:
            continue

        # Strip surrounding whitespace and punctuation, then lowercase to
        # avoid case mismatches with the keyword lists.
        line = str(row["title"]).strip().translate(punctuation_table)
        #line = str(row["description"]).strip().translate(punctuation_table)
        line = line.lower()
        # split() without an argument collapses runs of whitespace, so the
        # gaps left by removed punctuation do not become empty tokens that
        # would inflate word distances.
        # ``words`` stays a module-level name because get_distance reads it.
        words = line.split()

        words_to_test = [word for word in words if word in group1 or word in group2]
        if len(words_to_test) <= 1:
            continue

        # Group of the first keyword appearing in the title.
        g = -1
        if words_to_test[0] in group1:
            g = 1
        elif words_to_test[0] in group2:
            g = 2

        # Average the pairwise distances over every ordered pair of
        # distinct keywords found in the title.
        dist = 0
        c = 0
        for w in words_to_test:
            for q in words_to_test:
                if q != w:
                    dist += get_distance(w, q)
                    c += 1
        if c != 0:
            dist /= c

        # Debug trace for the boundary values 0 and 1.
        if dist == 0 or dist == 1:
            print("DIST:", dist)
            print(line)
            print(words_to_test)
            print("++++++++++++++++++")
        fout.write("%s;%f\n" % (g, dist))
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment