Commit 292b149b authored by Rémy Decoupes
Browse files

implement "AND" on Twitter query

parent 888efcd5
......@@ -54,15 +54,38 @@ def logsetup():
class Listener(tweepy.StreamListener):
def __init__(self, output_file=sys.stdout, logger=sys.stdout, keywords=None):
    """
    Set up a stream listener that writes keyword-matching tweets to a file.

    :param output_file: writable file-like object that receives the tweets kept by on_status
    :param logger: object used for progress messages; its ``info`` method is called here.
        NOTE(review): the default (sys.stdout) has no ``info`` method, so callers are
        expected to always pass a real logger -- confirm whether the default is ever used.
    :param keywords: list of strings used by on_status to decide which tweets are kept;
        when omitted, no keyword is tracked
    """
    super(Listener, self).__init__()
    self.output_file = output_file
    self.logger = logger
    self.logger.info("initiate stream listener")
    # Build a fresh list per instance: a literal ``[]`` default would be created once
    # and shared by every Listener constructed without an explicit keyword list.
    self.keywords = [] if keywords is None else keywords
def on_status(self, status):
    """
    Save an incoming tweet when it mentions at least one tracked keyword.

    The raw payload (``status._json``) is converted to a string and every keyword is
    looked up in it as a plain substring, so a keyword matches wherever it appears in
    the payload (plain text as well as hashtags).

    Here is a list of attributes/properties of the status object:
    ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__',
    '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__',
    '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__',
    '__str__', '__subclasshook__', '__weakref__', '_api', '_json', 'author', 'contributors', 'coordinates',
    'created_at', 'destroy', 'entities', 'favorite', 'favorite_count', 'favorited', 'filter_level', 'geo', 'id',
    'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'in_reply_to_status_id_str','in_reply_to_user_id',
    'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'parse', 'parse_list', 'place', 'possibly_sensitive',
    'quote_count', 'reply_count', 'retweet', 'retweet_count', 'retweeted', 'retweeted_status', 'retweets','source',
    'source_url', 'text', 'timestamp_ms', 'truncated', 'user']
    More information can be found on Tweepy's github:
    https://github.com/tweepy/tweepy/blob/master/tweepy/streaming.py

    :param status: tweet object delivered by the stream; only ``status._json`` is read here
    :return: None
    """
    try:
        # Stringify the payload once instead of once per keyword.
        payload = str(status._json)
        # Keep the tweet only if it matches one of the keywords (plain text or hashtag).
        if any(keyword in payload for keyword in self.keywords):
            # NOTE(review): assuming status._json is a dict (as in Tweepy), print() writes
            # its Python repr rather than JSON text -- confirm that downstream readers of
            # the output file expect this format.
            print(status._json, file=self.output_file)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate to the caller instead of being reported as a
        # file-saving failure.
        msg = "can not save tweets in file"+str(self.output_file)
        # Use the logger given to this listener rather than a module-level global.
        exitscript(self.logger, msg)
......@@ -77,6 +100,12 @@ class Listener(tweepy.StreamListener):
if __name__ == '__main__':
"""
Create a jsonl file which contains json lines of tweets.
This script filters the Twitter stream by a list of followed accounts and then only keeps tweets which match
a list of keywords.
Both the accounts and the list of keywords are defined by the MOOD project.
"""
# initialize a logger :
logger = logsetup()
logger.info("Collect tweets : start")
......@@ -107,19 +136,21 @@ if __name__ == '__main__':
accounttofollowed = pd.read_csv("params/accountsFollowed.csv")
accounttofollowedlist = list(map(str,accounttofollowed['twitterID'].tolist()))
# Get hashtag to track
hashtagtracked = pd.read_csv("params/keywordsFilter.csv")
hashtagtrackedList = list(map(str, hashtagtracked['hashtags'].tolist()))
# Get keyword to track
keyword = pd.read_csv("params/keywordsFilter.csv")
keywordList = list(map(str, keyword['hashtags'].tolist()))
# Start a Twitter stream
timestr = time.strftime("%Y%m%d-%H%M%S")
tweetouputfilename = "output/tweetoutput"+timestr+".jsonl"
tweetoutput = open(tweetouputfilename, 'w')
myStreamListener = Listener(tweetoutput, logger)
myStreamListener = Listener(tweetoutput, logger, keywordList)
stream = tweepy.Stream(auth=api.auth, listener=myStreamListener)
try:
logger.info("Start streaming")
stream.filter(follow=accounttofollowedlist, track=hashtagtrackedList)
# Could not filter on both ("AND") account AND hashtag because Twitter's API query only implements "OR"
# stream.filter(follow=accounttofollowedlist, track=hashtagtrackedList)
stream.filter(follow=accounttofollowedlist)
except KeyboardInterrupt as e:
logger.info("Stream Keyboard Interrupt")
finally:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment