Commit 417cdf7c authored by Decoupes Remy's avatar Decoupes Remy
Browse files

add Licence to the repo

No related merge requests found
Showing with 537 additions and 8 deletions
+537 -8
......@@ -369,13 +369,12 @@ def TFIDF_TF_with_corpus_state(elastic_query_fname, logger, nb_biggest_terms=500
# listOfCity = ['London', 'Glasgow', 'Belfast', 'Cardiff']
# listOfState = ["England", "Scotland", "Northern Ireland", "Wales"]
# Query Elasticsearch to get all tweets from UK
tweets = elasticsearch_query(elastic_query_fname, logger)
if listOfCities == 'all':
listOfCities = []
listOfStates = []
listOfCountry = []
for triple in tweetsByCityAndDate:
for triple in tweets:
splitted = triple.split("_")
listOfCities.append(splitted[0])
listOfStates.append(splitted[1])
......@@ -421,8 +420,8 @@ def TFIDF_TF_with_corpus_state(elastic_query_fname, logger, nb_biggest_terms=500
matrix_by_locality = matrixAllTweets[matrixAllTweets[spatial_hiearchy] == locality]
vectorizer = TfidfVectorizer(
stop_words='english',
#min_df=0.001,
max_features=50000,
min_df=0.001,
# max_features=50000,
ngram_range=(1, 4),
token_pattern='[a-zA-Z0-9#@]+',
)
......@@ -539,12 +538,11 @@ def TFIDF_TF_on_whole_corpus(elastic_query_fname, logger, path_for_filesaved="./
vectorizer = TfidfVectorizer(
stop_words='english',
#min_df=0.001,
max_features=50000,
min_df=0.001,
# max_features=50000,
ngram_range=(1, 4),
token_pattern='[a-zA-Z0-9#]+', #remove user name, i.e term starting with @ for personnal data issue
)
# logger.info("Compute TF-IDF on corpus = "+spatial_hiearchy)
try:
vectors = vectorizer.fit_transform(matrixAllTweets['tweet'])
feature_names = vectorizer.get_feature_names()
......@@ -812,7 +810,7 @@ def post_traitement_flood(biggest, logger, spatialLevel, ratio_of_flood=0.5):
if __name__ == '__main__':
# Workflow parameters :
## Rebuild H-TFIDF (with Matrix Occurence)
build_htfidf = True
build_htfidf = False
## eval 1 : Comparison with classical TF-IDf
build_classical_tfidf = True
## evla 2 : Use word_embedding with t-SNE
......
LICENSE 0 → 100644
This diff is collapsed.
......@@ -48,3 +48,15 @@ This is based upon works of:
* **Gaurav Shrivastava**, who coded the FASTR algorithm in Python. His script is in this repository
NB : **Due to the size of this corpus, Biotex could not be launched on the full corpus. The corpus had to be split into batches of 30k tweets; the results were then merged and ranked.**
## License
This code is provided under the [CeCILL-B](https://cecill.info/licences/Licence_CeCILL-B_V1-en.html) free software license agreement.
## Data Usage Agreement
By using the [E.Echen's dataset](https://github.com/echen102/COVID-19-TweetIDs), as [stated by the author](https://github.com/echen102/COVID-19-TweetIDs#data-usage-agreement) you agree to abide by the stipulations in the license, remain in compliance with Twitter’s [Terms of Service](https://developer.twitter.com/en/developer-terms/agreement-and-policy), and cite the following manuscript:
Chen E, Lerman K, Ferrara E
Tracking Social Media Discourse About the COVID-19 Pandemic: Development of a Public Coronavirus Twitter Data Set
JMIR Public Health Surveillance 2020;6(2):e19273
DOI: 10.2196/19273
PMID: 32427106
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment