"""
@brief: mood-tetis-tweets-collect creates jsonl files with bad quotes (single instead of double quotes); this script fixes them
@author: R.Decoupes but largely inspired by https://gist.github.com/mbrzusto/23fe728966247f25f3ec
@copyright CeCILL-B

Browse the output dir. For each file, check whether the fix has already been applied (we compare the number of lines between output and output.doublequote); if not, create the file in output.doublequote
"""

import json
import ast
import os
import logging
from logging.handlers import RotatingFileHandler

# Directory walked for the input jsonl files (lines written with single quotes)
path_dir_in = "/home/rdecoupe/mood-tetis-tweets-collect/output"
# Directory receiving the rewritten files (valid JSON, double quotes)
path_dir_out = "/home/rdecoupe/mood-tetis-tweets-collect/output.doublequote"
# Directory in which the rotating log file fix_bad_quote_json.log is written
path_log = "/home/rdecoupe/mood-tetis-tweets-collect/elasticsearch/log/fix_bad_quote_json"

# logger
def logsetup():
	"""
	Configure the root logger of this script :
		- Log in file : <path_log>/fix_bad_quote_json.log (DEBUG and above,
		  timestamped format, rotated at 1 000 000 bytes with 1 backup kept)
		- also print on screen (INFO and above, default format)
	The log directory is created when it does not exist yet.
	:return: logger object (the root logger)
	"""
	# RotatingFileHandler does not create missing parent directories
	os.makedirs(path_log, exist_ok=True)

	logger = logging.getLogger()
	logger.setLevel(logging.DEBUG)

	formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
	file_handler = RotatingFileHandler(
		os.path.join(path_log, 'fix_bad_quote_json.log'),
		mode='a',
		maxBytes=1000000,
		backupCount=1,
	)
	file_handler.setLevel(logging.DEBUG)
	file_handler.setFormatter(formatter)
	logger.addHandler(file_handler)

	stream_handler = logging.StreamHandler()
	# Only display on screen INFO and above
	stream_handler.setLevel(logging.INFO)
	logger.addHandler(stream_handler)
	return logger

def _count_lines(path):
	"""
	Count the lines of a text file.
	:param path: path of the file to read
	:return: number of lines, 0 when the file does not exist yet
	"""
	try:
		with open(path) as handle:
			return sum(1 for _ in handle)
	except FileNotFoundError:
		# The file has never been converted: same as an empty output
		return 0


def _convert_file(path_in, path_out):
	"""
	Rewrite every line of path_in (one Python literal per line) as a JSON
	document in path_out (one per line). path_out is overwritten.
	:param path_in: file with single-quoted records
	:param path_out: file to create with valid JSON records
	:return: number of records written
	:raise ValueError, SyntaxError: from ast.literal_eval on a malformed line
	"""
	nb_written = 0
	with open(path_in) as fr, open(path_out, "w") as fw:
		for line in fr:
			# literal_eval only parses literals, it never executes code
			fw.write(json.dumps(ast.literal_eval(line)) + "\n")
			nb_written += 1
	return nb_written


logger = logsetup()
logger.info("Transform jsonl single quotes into double quotes")

for root, _dirs, files in os.walk(path_dir_in):
	# os.walk recurses: mirror the sub-directory of the input in the output
	# so that nested files are found and same-named files do not collide
	dir_out = os.path.normpath(os.path.join(path_dir_out, os.path.relpath(root, path_dir_in)))
	for name in files:
		path_in = os.path.join(root, name)
		path_out = os.path.join(dir_out, name)
		nb_lines_in = _count_lines(path_in)
		nb_lines_out = _count_lines(path_out)
		logger.info("file: " + name + " in: " + str(nb_lines_in) + " and out:" + str(nb_lines_out))
		if nb_lines_in == nb_lines_out:
			# Same number of lines: the file was fully converted before
			logger.info(name + ": has been already processing during a previous run")
			continue
		os.makedirs(dir_out, exist_ok=True)
		# Count only what is written now, not the lines of the stale output
		nb_lines_out = _convert_file(path_in, path_out)
		logger.info(name + ": number of tweets: " + str(nb_lines_out))

logger.info("run successful")