diff --git a/bin/dirty-db-cleaner.py b/bin/dirty-db-cleaner.py new file mode 100755 index 000000000..8ed9c5065 --- /dev/null +++ b/bin/dirty-db-cleaner.py @@ -0,0 +1,45 @@ +#!/usr/bin/python -u +# +# Created by Bjarni R. Einarsson, placed in the public domain. Go wild! +# +import json +import os +import sys + +try: + dirtydb_input = sys.argv[1] + dirtydb_output = '%s.new' % dirtydb_input + assert(os.path.exists(dirtydb_input)) + assert(not os.path.exists(dirtydb_output)) +except: + print + print 'Usage: %s /path/to/dirty.db' % sys.argv[0] + print + print 'Note: Will create a file named dirty.db.new in the same folder,' + print ' please make sure permissions are OK and a file by that' + print ' name does not exist already. This script works by omitting' + print ' duplicate lines from the dirty.db file, keeping only the' + print ' last (latest) instance. No revision data should be lost,' + print ' but be careful, make backups. If it breaks you get to keep' + print ' both pieces!' + print + sys.exit(1) + +dirtydb = {} +lines = 0 +with open(dirtydb_input, 'r') as fd: + print 'Reading %s' % dirtydb_input + for line in fd: + lines += 1 + data = json.loads(line) + dirtydb[data['key']] = line + if lines % 10000 == 0: + sys.stderr.write('.') +print +print 'OK, found %d unique keys in %d lines' % (len(dirtydb), lines) + +with open(dirtydb_output, 'w') as fd: + for data in dirtydb.values(): + fd.write(data) + +print 'Wrote data to %s. All done!' % dirtydb_output