# fix_bad_quote_json.py
"""
@brief: modd-tetis-tweets-collect create jsonl with bad quote (single instead of double quote)
@author: R.Decoupes but largely inspired by https://gist.github.com/mbrzusto/23fe728966247f25f3ec
@copyright CeCILL-B

Browser ouput dir. For each file, check if the fix have been already applied (we compare the number between output and output.doublequote), if not: create a file in output.doublequote
"""

import json
import ast
import os
import logging
from logging.handlers import RotatingFileHandler

path_dir_in = "/home/rdecoupe/mood-tetis-tweets-collect/output"
path_dir_out = "/home/rdecoupe/mood-tetis-tweets-collect/output.doublequote"
path_log = "/home/rdecoupe/mood-tetis-tweets-collect/elasticsearch/log/fix_bad_quote_json"

# logger
def logsetup():
	"""
	Initiate a logger object :
		- Log in file : collectweets.log
		- also print on screen
	:return: logger object
	"""
	logger = logging.getLogger()
	logger.setLevel(logging.DEBUG)
	formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
	file_handler = RotatingFileHandler(path_log + '/fix_bad_quote_json.log', 'a', 1000000, 1)
	file_handler.setLevel(logging.DEBUG)
	file_handler.setFormatter(formatter)
	logger.addHandler(file_handler)
	stream_handler = logging.StreamHandler()
	# Only display INFO-level (and above) messages on screen
	stream_handler.setLevel(logging.INFO)
	logger.addHandler(stream_handler)
	return logger

logger = logsetup()
logger.info("Transform jsonl single quotes into double quotes")

for root, dirs, files in os.walk(path_dir_in):
	for name in files:
		fr = open(path_dir_in + "/" + name)
		nb_lines_in = sum(1 for line in fr)
		# Count the lines already written to the output file (0 if it does not exist yet)
		try:
			fw = open(path_dir_out + "/" + name)
			nb_lines_out = sum(1 for line in fw)
			fw.close()
		except FileNotFoundError:  # no output file yet for this input file
			nb_lines_out = 0
		logger.info("file: " + name + " in: " + str(nb_lines_in) + " and out: " + str(nb_lines_out))
		if nb_lines_in != nb_lines_out:
			fr.seek(0)  # go back to the start of the input file
			fw = open(path_dir_out + "/" + name, "w")
			nb_lines_out = 0  # recount while (re)writing the output file
			for line in fr:
				# Parse the single-quoted Python literal, then re-serialise it as valid JSON
				json_dat = json.dumps(ast.literal_eval(line))
				dict_dat = json.loads(json_dat)
				json.dump(dict_dat, fw)
				fw.write("\n")
				nb_lines_out = nb_lines_out + 1
			fw.close()
			logger.info(name + ": number of tweets: " + str(nb_lines_out))
		else:
			logger.info(name + ": has already been processed during a previous run")
		fr.close()

logger.info("run successful")