Commit 68aa8cea authored by Interdonato Roberto

Upload New File

parent 27b1925f
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
import datetime
# Count bigram frequencies over the titles of the papers listed in a CSV file
# and write every bigram that occurs more than once to a ';'-separated file.

# Keyword groups that are removed from the titles before counting bigrams.
# NOTE(review): filtering is applied word by word, so multi-word entries such
# as 'future studies' can never match a single title word.
group1 = ['prospective', 'anticipation', 'foresights', 'forecast', 'future studies', 'prospective thinking', 'strategic planning', 'prospeccion', 'prospectiva']
group2 = ['modélisation', 'modelling', 'scenario', 'planning', 'mapping', 'simulation', 'quantitative','assessement', 'qualitative', 'narrative','scenarios','modelado']
stop_words = set(stopwords.words('english'))
# Build the exclusion set once so that every membership test is O(1).
excluded_words = stop_words.union(group1, group2)

input_f = "test_jeremy1.csv"
# Alternative input file: "test_jeremy1_abs-title-key.csv"
df = pd.read_csv(input_f, index_col=0, sep=';')

# Inclusive range of publication years to keep.
start_date = 1970
# NOTE(review): end_date was used but never defined in the original script.
# It defaults to the current year here; set it to the intended upper bound.
end_date = datetime.date.today().year

# Translation table that strips all ASCII punctuation; built once, outside
# the loop.
punctuation_table = str.maketrans('', '', string.punctuation)

kept_words = []
for _, row in df.iterrows():
    try:
        year = datetime.datetime.strptime(row["coverDate"], '%Y-%m-%d').year
    except (TypeError, ValueError):
        # TypeError: the date is missing (NaN is a float, not a str).
        # ValueError: the date does not match the YYYY-MM-DD format.
        # NOTE(review): the original handler bodies were lost; skipping the
        # row avoids reusing the date parsed for a previous row.
        continue
    if not start_date <= year <= end_date:
        continue
    title = row["title"]
    # Use row["description"] instead of row["title"] to analyse abstracts.
    if pd.isna(title):
        # A missing title would otherwise be stringified to the word "nan".
        continue
    # Strip surrounding whitespace, drop punctuation and lowercase the title
    # to avoid case mismatches with the exclusion set.
    line = str(title).strip().translate(punctuation_table).lower()
    kept_words.extend(
        word for word in line.split(" ") if word not in excluded_words
    )

# All titles are joined into a single text, so bigrams may span the boundary
# between two consecutive titles (same as the original behaviour).
text = " ".join(kept_words)
tokens = nltk.word_tokenize(text)
bgs = nltk.bigrams(tokens)
# Frequency distribution of all the bigrams in the text.
fdist = nltk.FreqDist(bgs)

# The context manager guarantees the output file is flushed and closed.
with open("global_occs_ngrams_%d_%d_noGroupWords.csv" % (start_date,end_date), 'w', encoding="utf-8") as fout:
    for k, v in fdist.items():
        # Only keep bigrams that occur more than once.
        if v > 1:
            fout.write("%s;%s\n" % (k,v))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment