Commit 3250da39 authored by Interdonato Roberto's avatar Interdonato Roberto

Upload New File

parent 7d022e3e
import os.path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from time import time
import numpy as np
from nltk.corpus import stopwords
import pandas as pd
# Maximum vocabulary size kept by the CountVectorizer.
n_features = 10000
# Default number of LDA topics (overridable per call via ldaModel's `topics`).
ntopics=10
# English stop words excluded from the vocabulary (requires nltk 'stopwords' corpus).
stop_words = set(stopwords.words('english'))
def preproc_csv(path, out):
    """Extract the 'description' column from a semicolon-separated CSV and
    write it to `out` as a tab-separated file with header ``id\tabs``.

    Rows whose description is missing (pandas NaN) are skipped.

    Parameters
    ----------
    path : str
        Input CSV; the first column is used as the index, fields split on ';'.
    out : str
        Destination path for the tab-separated id/abstract file.
    """
    df = pd.read_csv(path, index_col=0, sep=';')
    # 'with' guarantees the handle is closed even if a write raises
    # (the original left the file open on error).
    with open(out, 'w', encoding='utf-8') as fout:
        fout.write("id\tabs\n")
        for index, row in df.iterrows():
            description = row['description']  # renamed: 'abs' shadowed the builtin
            # pandas represents a missing description as float NaN; keep real text only
            if isinstance(description, str):
                fout.write("%d\t%s\n" % (index, description))
def ldaModel(pd_col, topics=ntopics):
    """Fit an online LDA topic model on a collection of raw documents.

    Parameters
    ----------
    pd_col : iterable of str
        Documents, e.g. a pandas Series of abstracts.
    topics : int, optional
        Number of topics to extract (default: module-level `ntopics`).

    Returns
    -------
    lda : LatentDirichletAllocation
        The fitted model.
    tf_vectorizer : CountVectorizer
        The fitted term-count vectorizer.
    topic_pr : ndarray of shape (n_docs, topics)
        Per-document topic probability distribution.
    """
    # Strip surrounding whitespace from every document (comprehension
    # replaces the manual append loop).
    texts = [t.strip() for t in pd_col]

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    t0 = time()
    tf_vectorizer = CountVectorizer(max_df=0.5, min_df=5,
                                    max_features=n_features,
                                    stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(texts)

    print("Fitting LDA models with tf features, "
          " n_features=%d..." % (n_features))
    # random_state=0 makes the topic decomposition reproducible.
    lda = LatentDirichletAllocation(n_components=topics,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    topic_pr = lda.transform(tf)
    print("done in %0.3fs." % (time() - t0))
    return lda, tf_vectorizer, topic_pr
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=20):
    """Return the top `n_words` keywords for each LDA topic.

    Parameters
    ----------
    vectorizer : CountVectorizer-like
        Fitted vectorizer providing the vocabulary.
    lda_model : LatentDirichletAllocation-like
        Fitted model; `components_` holds one weight row per topic.
    n_words : int, optional
        Number of keywords to keep per topic (default 20).

    Returns
    -------
    list of numpy.ndarray
        One array of keyword strings per topic, ordered by descending weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use the new
    # API when available and fall back for older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.array(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # argsort of the negated weights -> indices of largest weights first
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
# ---- driver script ------------------------------------------------------
# Raw string: backslashes in the Windows path must not be interpreted as
# escape sequences (the original relied on '\M'/'\P' not being valid escapes,
# which is a DeprecationWarning and will become an error).
os.chdir(r'D:\Mes Donnees\Papers\Modeling-Forecast text mining')

path = "test_jeremy1.csv"
out = "test_jeremy1_lda_input.csv"

# Extract id/abstract pairs, then fit the LDA model on the abstracts.
preproc_csv(path, out)
df_ldainput = pd.read_csv(out, sep='\t')
lda, vec, topic_pr = ldaModel(df_ldainput["abs"])

# Assign each paper its most probable topic. The original opened
# 'pubs_topics.csv' twice (fout_count was never used or closed); a single
# 'with' block fixes the leaked handle.
with open('pubs_topics.csv', 'w') as fw:
    fw.write('paper_id;topic\n')
    for index, row in enumerate(topic_pr):
        top = row.argmax()  # index of the highest-probability topic
        fw.write("%d;%d\n" % (df_ldainput.iloc[index]["id"], top))

# Print and export the top 15 keywords per topic.
topic_keywords = show_topics(vec, lda, n_words=15)
for t in topic_keywords:
    print(t)

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]

with open("topics2.csv", 'w', encoding='utf-8') as fc:
    fc.write(df_topic_keywords.to_csv())
print("done")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment