propagation_gan/opencellid_parsing/003a.clean_data_with_ground...

52 lines
1.6 KiB
Python

#!/usr/bin/python
import os
import sys
import pickle
############################
# extract measurements
############################
# Usage: <this script> <folderpath>
#
# Reads  <folderpath>/all_with_groundtruth.pickle, removes consecutive
# duplicate measurements for every key, drops keys left without any
# measurement, and writes:
#   <folderpath>/all_with_groundtruth_cleaned.pickle  - the cleaned data
#   <folderpath>/all_with_groundtruth_cleaned.stat    - one CSV line per
#       distinct measurement count: "<count>,<number of keys>,<running total>"


def dedupe_consecutive(records):
    """Remove, in place, every record that repeats the one just before it.

    Each record is a comma-separated string; two records count as duplicates
    when their first three fields (lon, lat, sig) are equal.  Only *adjacent*
    duplicates are removed, and the first record of each run is kept.

    The list object itself is modified (slice assignment) so that containers
    holding a reference to it -- e.g. an immutable tuple -- see the result.

    Raises:
        ValueError: if a record has fewer than three comma-separated fields.
            In that case ``records`` is left untouched.
    """
    kept = []
    previous = None
    for record in records:
        lon, lat, sig = record.split(',')[0:3]
        fingerprint = (lon, lat, sig)
        if fingerprint != previous:
            kept.append(record)
        previous = fingerprint
    # Single O(n) rewrite instead of repeated O(n) `del` calls.
    records[:] = kept


def clean_measurements(data):
    """Deduplicate the measurements of every key in ``data`` (in place).

    ``data[key][1]`` is expected to be the list of measurement strings for
    ``key``.  Keys whose list is empty after cleaning are deleted from
    ``data``.

    Returns:
        dict mapping "number of measurements remaining" to "number of keys
        that have exactly that many measurements".
    """
    cleaned_dist = {}
    # Iterate over a snapshot of the keys: entries may be deleted below, and
    # deleting from a dict while iterating its live key view raises
    # RuntimeError on Python 3.
    for key in list(data):
        records = data[key][1]
        dedupe_consecutive(records)
        length_after = len(records)
        if length_after == 0:
            del data[key]
        else:
            cleaned_dist[length_after] = cleaned_dist.get(length_after, 0) + 1
    return cleaned_dist


def write_length_stats(cleaned_dist, stat_path):
    """Write the measurement-count distribution to ``stat_path``.

    One line per distinct count, in ascending order:
    ``<count>,<number of keys>,<cumulative number of keys>``.
    """
    with open(stat_path, 'w') as f:
        total_count = 0
        for num in sorted(cleaned_dist):
            count = cleaned_dist[num]
            total_count += count
            f.write("{},{},{}\n".format(num, count, total_count))


def main(argv):
    """Run the cleaning step; ``argv`` has the same layout as ``sys.argv``.

    Returns the process exit status: 0 on success, 2 when the folder argument
    is missing or the input pickle does not exist.
    """
    if len(argv) < 2:
        print("usage: {} <folderpath>".format(argv[0] if argv else "script"))
        return 2

    folderpath = argv[1]
    pickle_fp_with_gt = "{}/all_with_groundtruth.pickle".format(folderpath)
    cleaned_pickle_fp_with_gt = "{}/all_with_groundtruth_cleaned.pickle".format(folderpath)
    cleaned_stat_fp_with_gt = "{}/all_with_groundtruth_cleaned.stat".format(folderpath)

    if not os.path.isfile(pickle_fp_with_gt):
        print("run 002 script first")
        return 2

    # NOTE: pickle.load executes arbitrary code embedded in the file.  Only
    # point this script at a pickle produced by the preceding 002 script.
    with open(pickle_fp_with_gt, 'rb') as f:
        data = pickle.load(f)

    # clean data
    cleaned_dist = clean_measurements(data)

    with open(cleaned_pickle_fp_with_gt, 'wb') as f:
        pickle.dump(data, f)

    # summarize number of measurements
    write_length_stats(cleaned_dist, cleaned_stat_fp_with_gt)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))