propagation_gan/opencellid_parsing/002.extract_measurements.py

77 lines
2.5 KiB
Python

#!/usr/bin/python
import os
import sys
import pickle
# Suffix that identifies the per-chunk pickles to be combined.
PARSED_SUFFIX = "_parsed.pickle"


def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code. Only run this
    # script on folders whose pickles come from a trusted pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing (and thereby flushing) the file."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _part_index(filename):
    """Return the integer found just before the ``_parsed.pickle`` suffix.

    The name is split on ``_`` after removing the suffix, so
    ``foo_12_parsed.pickle`` yields ``12``.  The suffix is sliced off
    explicitly: ``str.rstrip`` strips a *set of characters*, not a suffix,
    and only worked by accident because digits are not in that set.
    """
    stem = filename[:-len(PARSED_SUFFIX)]
    return int(stem.split('_')[-1])


def _combine_parts(folderpath):
    """Merge every ``*_parsed.pickle`` in *folderpath* into one dict.

    Parts are processed in ascending numeric order of their index.  The first
    part that contains a key contributes the whole entry; for later parts
    only element ``[1]`` of the entry is appended (via ``+=``) to the
    existing one.
    """
    # endswith() rather than a substring test, so stray files such as
    # "x_1_parsed.pickle.bak" are ignored instead of crashing the sort key.
    part_names = [name for name in os.listdir(folderpath)
                  if name.endswith(PARSED_SUFFIX)]
    combined = {}
    for name in sorted(part_names, key=_part_index):
        print("loading {}".format(name))
        part = _load_pickle(os.path.join(folderpath, name))
        print("combining..")
        for key in part:
            if key not in combined:
                combined[key] = part[key]
            else:
                # Entries must be mutable at index 1 (e.g. a list whose
                # second element is a list) for this in-place concatenation.
                combined[key][1] += part[key][1]
    return combined


def main(folderpath):
    """Build ``all.pickle`` (if missing) and split it into three pickles.

    If ``<folderpath>/all.pickle`` already exists it is loaded as-is;
    otherwise all ``*_parsed.pickle`` files in the folder are combined and
    the result is cached there.  Entries are then routed by substring of
    their key into:

    * ``all_with_groundtruth.pickle`` -- key contains ``with_groundtruth``
    * ``all_no_groundtruth.pickle``   -- key contains ``without_groundtruth``
    * ``all_unseen.pickle``           -- key contains ``unseen``

    Keys matching none of these substrings are dropped from the split
    outputs (they remain in ``all.pickle``).

    Args:
        folderpath: directory holding the input pickles; outputs are written
            to the same directory.
    """
    print("load ~55GB data into memory! do not try this if not on server machine")
    ############################
    # extract measurements
    ############################
    all_pickle_fp = os.path.join(folderpath, "all.pickle")
    pickle_fp_with_gt = os.path.join(folderpath, "all_with_groundtruth.pickle")
    pickle_fp_no_gt = os.path.join(folderpath, "all_no_groundtruth.pickle")
    pickle_fp_unseen = os.path.join(folderpath, "all_unseen.pickle")

    if os.path.isfile(all_pickle_fp):
        # load data only
        data = _load_pickle(all_pickle_fp)
    else:
        # load and combine data, then cache the result for later runs
        data = _combine_parts(folderpath)
        _dump_pickle(data, all_pickle_fp)

    data_with_gt = {}
    data_no_gt = {}
    data_unseen = {}
    for key in data:
        if "with_groundtruth" in key:
            data_with_gt[key] = data[key]
        elif "without_groundtruth" in key:
            data_no_gt[key] = data[key]
        elif "unseen" in key:
            data_unseen[key] = data[key]

    _dump_pickle(data_with_gt, pickle_fp_with_gt)
    # The original wrote to the undefined name `pickle_no_gt` here, which
    # raised NameError after all the expensive loading had already been done.
    _dump_pickle(data_no_gt, pickle_fp_no_gt)
    _dump_pickle(data_unseen, pickle_fp_unseen)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} <folder containing *_parsed.pickle files>".format(sys.argv[0]))
    main(sys.argv[1])
# print("extracting each file to disk (only groundtruth we cared)..")
# counter = 0
# for filepath in data:
# if "with_groundtruth" not in filepath:
# continue
# if not os.path.isfile(filepath):
# outputsubfolder = os.path.dirname(filepath)
# if not os.path.isdir(outputsubfolder):
# try:
# os.makedirs(outputsubfolder)
# except BaseException:
# print("err: cannot create folder {}!!!!".format(outputsubfolder))
# continue
# with open(filepath, 'w') as of:
# of.write(headlines.get(data[filepath][0], headline) + '\n')
# with open(filepath, 'a') as of:
# for dataline in data[filepath][1]:
# of.write(dataline + "\n")
# counter += 1
# if counter % 1000 == 0:
# print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))