Commit 57ff00b8 authored by Rémy Decoupes's avatar Rémy Decoupes
Browse files

merge mood server with dev

parents cf6b4066 90d166d6
......@@ -20,3 +20,4 @@ elasticsearch/log/
elasticsearch/logstash/sincedb.log
elasticsearch/logfix_bad_quote_json.log
elasticsearch/src/figs/
/elasticsearch/analysis-output/
......@@ -34,9 +34,24 @@ python3 collectTweets.py
2. Modify `/lib/systemd/system/mood-tweets-collect.service` with the correct `WorkingDirectory` and `User`
3. Reload systemd :`sudo systemctl daemon-reload`
4. Enable auto start using command : `sudo systemctl enable mood-tweets-collect.service`
## Optional: Index tweets and explore data with ELK stack:
## Optional analysis
### Index tweets and explore data with ELK stack:
<img src="https://images.contentstack.io/v3/assets/bltefdd0b53724fa2ce/blt280217a63b82a734/5bbdaacf63ed239936a7dd56/elastic-logo.svg" alt="elastic" width="200">
See [documentation](elasticsearch/indexation.md)
### Visualize the retweet mechanism thanks to Gephi
<img src="https://gephi.org/images/logo.png" alt="gephi">
1. Install gephi with snap: `snap install gephi`
2. Export 2 files :
    1. Edges with `["ID", "Source", "Target", "timeset"]`. Be careful: the column names are very important, otherwise Gephi won't recognize them. Timeset is not mandatory: use it if you want a dynamic graph
    2. Nodes with `["ID", "Label"]`
3. Import both files with gephi
4. Change the spatial layout (force atlas 2 for example)
5. Change size of nodes corresponding to their importance (degree in and out)
6. Change color of nodes according to their community:
    1. Apply a community detection algorithm: we use modularity (from Gephi statistics: it clusters nodes that share many links with each other and far fewer with nodes outside their community)
    2. In the Appearance panel, choose Partition and select the modularity class that has just been created
## License
This code is provided under the [CeCILL-B](https://cecill.info/licences/Licence_CeCILL-B_V1-en.html) free software license agreement.
......@@ -12,8 +12,8 @@
"user.location",
"entities.media.expanded_url",
"entities.urls.expanded_url",
"user.name",
"retweeted_status.user.name",
"user.id",
"retweeted_status.user.id",
"rest.*"
],
"_source": false,
......
......@@ -5,6 +5,8 @@
Download metric from Elasticsearch to easily compute TF-IDF matric
"""
import hashlib
from elasticsearch import Elasticsearch
from jinja2 import FileSystemLoader, Environment
import os
......@@ -105,6 +107,8 @@ def tf_idf(list_of_docs, lang='english', nb_top_score=1000):
return tf_idf_top_score
if __name__ == '__main__':
# print working directory:
print("working directory is: " + str(os.getcwd()))
# Filter
state = 'aquitaine'
start_date = "2020-12-01T00:00:00.172Z"
......@@ -137,13 +141,16 @@ if __name__ == '__main__':
df_tfidf = tf_idf(df_results["text"].tolist())
df_tfidf["tf_idf_terms"] = df_tfidf.index
df_tfidf = df_tfidf.merge(df_results, on="index_of_tweet")
# prepare to Gephi for graph vizu
gephi = df_results
gephi["Source"] = gephi["user.name"]
gephi["Target"] = gephi["retweeted_status.user.name"]
gephi["ID"] = gephi.index
gephi["Timestamp"] = gephi["@timestamp"]
gephi["label"] = gephi["retweeted_status.user.name"]
# prepare to Gephi for graph vizu: Graph bipartites. Nodes are Newspaper and TF-IDf
news_paper_name = pd.read_csv("./../params/accountsFollowed.csv") # get account (followed by MOOD) names
news_paper_name["retweeted_status.user.id"] = news_paper_name["twitterID"] # prepare for merge
gephi = df_tfidf
# gephi["Source"] = gephi["user.id"].apply(lambda x: hashlib.md5(str(x).encode()).hexdigest()) # pseudonimization
gephi["Source"] = gephi.index # id du lien
gephi["Target"] = gephi["retweeted_status.user.id"]
gephi["Id"] = gephi.index
gephi["Label"] = gephi["tf_idf_terms"]
gephi["timeset"] = gephi["@timestamp"]
gephi = gephi[gephi["Target"].str.len() !=0] # filter out tweet that are not retweeted
gephi[["ID", "label", "Source", "Target", "Timestamp"]].to_csv(
"analysis-output/acquitaine_script_gephi.csv",
......@@ -180,7 +187,21 @@ if __name__ == '__main__':
df_tfidf = df_tfidf.join(classifier_df)
df_tfidf.to_csv("analysis-output/acquitaine-digitalepidemiologylab.csv")
df_tfidf.to_pickle("analysis-output/acquitaine-digitalepidemiologylab.pkl")
gephi[["Id", "Label", "Source", "Target", "timeset"]].to_csv(
"analysis-output/acquitaine_script_gephi_edge.csv",
index=False
)
gephi.to_csv("analysis-output/gephi-debug.csv")
# Node: newspapers (MOOD account followed)
gephi_node = pd.DataFrame(gephi["retweeted_status.user.id"].unique(), columns=["retweeted_status.user.id"])
gephi_node["Label"] = gephi_node.merge(news_paper_name, on="retweeted_status.user.id")["account"]
gephi_node["Id"] = gephi_node["retweeted_status.user.id"]
# Node: TF-IDF
gephi_node_sub = pd.DataFrame(gephi["tf_idf_terms"].unique(), columns=["tf_idf_terms"])
gephi_node_sub["Id"] = gephi_node_sub.merge(gephi, on="tf_idf_terms")["Id"]
gephi_node_sub = gephi_node_sub.rename(columns={"tf_idf_terms": "Label", "Id": "Id"})
gephi_node = gephi_node.append(gephi_node_sub)
gephi_node.to_csv(
"analysis-output/acquitaine_script_gephi_node.csv",
index=False
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment