Commit 417cdf7c authored by Decoupes Remy's avatar Decoupes Remy
Browse files

add Licence to the repo

No related merge requests found
Showing with 537 additions and 8 deletions
+537 -8
......@@ -369,13 +369,12 @@ def TFIDF_TF_with_corpus_state(elastic_query_fname, logger, nb_biggest_terms=500
# listOfCity = ['London', 'Glasgow', 'Belfast', 'Cardiff']
# listOfState = ["England", "Scotland", "Northern Ireland", "Wales"]
# Query Elasticsearch to get all tweets from UK
tweets = elasticsearch_query(elastic_query_fname, logger)
if listOfCities == 'all':
listOfCities = []
listOfStates = []
listOfCountry = []
for triple in tweetsByCityAndDate:
for triple in tweets:
splitted = triple.split("_")
listOfCities.append(splitted[0])
listOfStates.append(splitted[1])
......@@ -421,8 +420,8 @@ def TFIDF_TF_with_corpus_state(elastic_query_fname, logger, nb_biggest_terms=500
matrix_by_locality = matrixAllTweets[matrixAllTweets[spatial_hiearchy] == locality]
vectorizer = TfidfVectorizer(
stop_words='english',
#min_df=0.001,
max_features=50000,
min_df=0.001,
# max_features=50000,
ngram_range=(1, 4),
token_pattern='[a-zA-Z0-9#@]+',
)
......@@ -539,12 +538,11 @@ def TFIDF_TF_on_whole_corpus(elastic_query_fname, logger, path_for_filesaved="./
vectorizer = TfidfVectorizer(
stop_words='english',
#min_df=0.001,
max_features=50000,
min_df=0.001,
# max_features=50000,
ngram_range=(1, 4),
token_pattern='[a-zA-Z0-9#]+', #remove user name, i.e term starting with @ for personnal data issue
)
# logger.info("Compute TF-IDF on corpus = "+spatial_hiearchy)
try:
vectors = vectorizer.fit_transform(matrixAllTweets['tweet'])
feature_names = vectorizer.get_feature_names()
......@@ -812,7 +810,7 @@ def post_traitement_flood(biggest, logger, spatialLevel, ratio_of_flood=0.5):
if __name__ == '__main__':
# Workflow parameters :
## Rebuild H-TFIDF (with Matrix Occurence)
build_htfidf = True
build_htfidf = False
## eval 1 : Comparison with classical TF-IDf
build_classical_tfidf = True
## evla 2 : Use word_embedding with t-SNE
......
LICENSE 0 → 100644
This diff is collapsed.
......@@ -48,3 +48,15 @@ This is based upon works of:
* **Gaurav Shrivastava**, who coded the FASTR algorithm in Python. His script is in this repository
NB : **Due to the size of this corpus, Biotex could not be launched on the full corpus. The corpus had to be split into batches of 30k tweets; the results were then merged and ranked.**
## License
This code is provided under the [CeCILL-B](https://cecill.info/licences/Licence_CeCILL-B_V1-en.html) free software license agreement.
## Data Usage Agreement
By using the [E.Echen's dataset](https://github.com/echen102/COVID-19-TweetIDs), as [stated by the author](https://github.com/echen102/COVID-19-TweetIDs#data-usage-agreement) you agree to abide by the stipulations in the license, remain in compliance with Twitter’s [Terms of Service](https://developer.twitter.com/en/developer-terms/agreement-and-policy), and cite the following manuscript:
Chen E, Lerman K, Ferrara E
Tracking Social Media Discourse About the COVID-19 Pandemic: Development of a Public Coronavirus Twitter Data Set
JMIR Public Health Surveillance 2020;6(2):e19273
DOI: 10.2196/19273
PMID: 32427106
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment