""" @brief: modd-tetis-tweets-collect create jsonl with bad quote (single instead of double quote) @author: R.Decoupes but largely inspired by https://gist.github.com/mbrzusto/23fe728966247f25f3ec @copyright CeCILL-B Browser ouput dir. For each file, check if the fix have been already applied (we compare the number between output and output.doublequote), if not: create a file in output.doublequote """ import json import ast import os import logging from logging.handlers import RotatingFileHandler path_dir_in = "/home/rdecoupe/mood-tetis-tweets-collect/output" path_dir_out = "/home/rdecoupe/mood-tetis-tweets-collect/output.doublequote" path_log = "/home/rdecoupe/mood-tetis-tweets-collect/elasticsearch/log/fix_bad_quote_json" # logger def logsetup(): """ Initiate a logger object : - Log in file : collectweets.log - also print on screen :return: logger object """ logger = logging.getLogger() logger.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s') file_handler = RotatingFileHandler(path_log + '/fix_bad_quote_json.log', 'a', 1000000, 1) file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(formatter) logger.addHandler(file_handler) stream_handler = logging.StreamHandler() # Only display on screen INFO stream_handler.setLevel(logging.INFO) logger.addHandler(stream_handler) return logger logger = logsetup() logger.info("Transform jsonl single quotes into double quotes") for root, dirs, files in os.walk(path_dir_in): for name in files: fr = open(path_dir_in + "/" + name) fw = open(path_dir_out + "/" + name) nb_lines_in = sum(1 for line in fr) try: nb_lines_out = sum(1 for line in fw) except: #file is empty nb_lines_out = 0 logger.info("file: " + name + " in: "+ str(nb_lines_in) + " and out:" + str(nb_lines_out)) if nb_lines_in != nb_lines_out: fr.seek(0) # go to the start of the file fw = open(path_dir_out + "/" + name, "w") for line in fr: json_dat = json.dumps(ast.literal_eval(line)) dict_dat = json.loads(json_dat) json.dump(dict_dat, fw) fw.write("\n") nb_lines_out = nb_lines_out + 1 logger.info(name + ": number of tweets: " + str(nb_lines_out)) else: logger.info(name + ": has been already processing during a previous run") fw.close() fr.close() logger.info("run successful")