Commit 2132c4ec authored by Rémy Decoupes's avatar Rémy Decoupes
Browse files

fix json double quote recursively

parent 1a1fa27d
......@@ -17,3 +17,4 @@ __pycache__*
elasticsearch/data/
elasticsearch/log/
elasticsearch/logstash/sincedb.log
elasticsearch/logfix_bad_quote_json.log
......@@ -3,8 +3,8 @@ input{
type => "json"
path => "/home/rdecoupe/mood-tetis-tweets-collect/output.doublequote/*.jsonl"
codec => "json"
sincedb_path => "/dev/null"
# sincedb_path => "/home/rdecoupe/mood-tetis-tweets-collect/elasticsearch/logstash/sincedb.log"
# sincedb_path => "/dev/null"
sincedb_path => "/home/rdecoupe/mood-tetis-tweets-collect/elasticsearch/logstash/sincedb.log"
start_position => "beginning"
}
}
......
# from https://gist.github.com/mbrzusto/23fe728966247f25f3ec
"""
@brief: mood-tetis-tweets-collect creates jsonl with bad quotes (single instead of double quotes)
@author: R.Decoupes but largely inspired by https://gist.github.com/mbrzusto/23fe728966247f25f3ec
@copyright CeCILL-B
Browse the output dir. For each file, check whether the fix has already been applied (we compare the number of lines between output and output.doublequote); if not, create the file in output.doublequote
"""
import json
import ast
import os
import logging
from logging.handlers import RotatingFileHandler
path_dir_in = "/home/rdecoupe/mood-tetis-tweets-collect/output"
path_dir_out = "/home/rdecoupe/mood-tetis-tweets-collect/output.doublequote"
path_log = "/home/rdecoupe/mood-tetis-tweets-collect/elasticsearch/log/fix_bad_quote_json"
# logger
def logsetup():
    """
    Configure the root logger and hand it back.

    Two handlers are attached to the root logger:
    - a rotating file handler appending to ``fix_bad_quote_json.log`` inside
      ``path_log`` (DEBUG and above, timestamped format, rotated at
      1,000,000 bytes with one backup file kept)
    - a stream handler for the console (INFO and above, default format)
    :return: the configured root logger
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    to_file = RotatingFileHandler(
        path_log + '/fix_bad_quote_json.log',
        mode='a',
        maxBytes=1000000,
        backupCount=1,
    )
    to_file.setLevel(logging.DEBUG)
    to_file.setFormatter(
        logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
    )

    # The console only shows INFO and above; the file keeps everything.
    to_screen = logging.StreamHandler()
    to_screen.setLevel(logging.INFO)

    for handler in (to_file, to_screen):
        root_logger.addHandler(handler)
    return root_logger
def _count_lines(path):
    """
    Count the lines of a text file.
    :param path: path of the file to read
    :return: number of lines, or None when the file does not exist
    """
    try:
        with open(path) as handle:
            return sum(1 for _ in handle)
    except FileNotFoundError:
        return None


def _convert_file(path_in, path_out):
    """
    Rewrite a single-quoted jsonl file as valid (double-quoted) jsonl.
    Each input line is parsed as a Python literal and dumped as one JSON line.
    The output file is created, or overwritten if it already exists.
    :param path_in: path of the file holding one Python literal per line
    :param path_out: path of the jsonl file to write
    :return: number of lines written
    """
    nb_lines = 0
    with open(path_in) as fr, open(path_out, "w") as fw:
        for line in fr:
            # literal_eval only accepts literals, so no code from the input is executed
            json.dump(ast.literal_eval(line), fw)
            fw.write("\n")
            nb_lines += 1
    return nb_lines


logger = logsetup()
logger.info("Transform jsonl single quotes into double quotes")

# One-off conversion of a single hard-coded file.
# NOTE(review): this file sits under path_dir_in, so the directory walk below
# would handle it as well - confirm whether this explicit call is still needed.
_convert_file(
    "/home/rdecoupe/mood-tetis-tweets-collect/output/tweetoutput20200622-093342.jsonl",
    "/home/rdecoupe/mood-tetis-tweets-collect/output.doublequote/tweetoutput20200622-093342.jsonl",
)

for root, dirs, files in os.walk(path_dir_in):
    for name in files:
        # Use the directory yielded by os.walk so files in sub-directories are found
        path_in = os.path.join(root, name)
        # Output stays flat in path_dir_out, as before
        path_out = os.path.join(path_dir_out, name)

        # Count the existing output BEFORE opening it for writing: opening with
        # "w" truncates the file, which would make the comparison meaningless.
        # A missing output file yields None and therefore never matches.
        if _count_lines(path_in) == _count_lines(path_out):
            logger.info(name + ": has been already processing during a previous run")
            continue

        nb_lines_out = _convert_file(path_in, path_out)
        logger.info(name + ": number of tweets: " + str(nb_lines_out))

logger.info("run successful")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment