From f085380ad7ad70e48d541eb59ff0e4e812fb0987 Mon Sep 17 00:00:00 2001
From: HappyZ
Date: Mon, 10 Jun 2019 11:55:09 -0500
Subject: [PATCH] still too slow to dump each file, separate pickle into three pickles instead

---
 .../002.extract_measurements.py               | 76 ++++++++++---------
 ...tract_only_enough_data_with_groundtruth.py | 55 ++++++++++++++
 2 files changed, 97 insertions(+), 34 deletions(-)
 create mode 100644 opencellid_parsing/003.extract_only_enough_data_with_groundtruth.py

diff --git a/opencellid_parsing/002.extract_measurements.py b/opencellid_parsing/002.extract_measurements.py
index 273c635..c88cf06 100644
--- a/opencellid_parsing/002.extract_measurements.py
+++ b/opencellid_parsing/002.extract_measurements.py
@@ -12,22 +12,13 @@ print("load ~55GB data into memory! do not try this if not on server machine")
 ############################
 folderpath = sys.argv[1]
 all_pickle_fp = "{}/all.pickle".format(folderpath)
-
-outputfolder = folderpath.rstrip('/') + '_extracted'
-if not os.path.isdir(outputfolder):
-    os.makedirs(outputfolder)
-
-headline = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
-headlines = {
-    'UMTS': headline + ',rnc,cidshort,psc',
-    'GSM': headline + ',timingadv',
-    'LTE': headline + ',timingadv,tac,pci',
-    'CDMA': headline + ',sid,nid,bid',
-}
+pickle_fp_with_gt = "{}/all_with_groundtruth.pickle".format(folderpath)
+pickle_fp_no_gt = "{}/all_no_groundtruth.pickle".format(folderpath)
+pickle_fp_unseen = "{}/all_unseen.pickle".format(folderpath)
 
 if os.path.isfile(all_pickle_fp):
     # load data only
-    data = pickle.load(open(all_pickle_fp), 'rb')
+    data = pickle.load(open(all_pickle_fp, 'rb'))
 else:
     # load and combine data
     files = [file for file in os.listdir(folderpath) if '_parsed.pickle' in file]
@@ -45,24 +36,41 @@ else:
     pickle.dump(data, open(all_pickle_fp, 'wb'))
 
 
-print("extracting each file to disk (only groundtruth we cared)..")
-counter = 0
-for filepath in data:
-    if "with_groundtruth" not in filepath:
-        continue
-    if not os.path.isfile(filepath):
-        outputsubfolder = os.path.dirname(filepath)
-        if not os.path.isdir(outputsubfolder):
-            try:
-                os.makedirs(outputsubfolder)
-            except BaseException:
-                print("err: cannot create folder {}!!!!".format(outputsubfolder))
-                continue
-        with open(filepath, 'w') as of:
-            of.write(headlines.get(data[filepath][0], headline) + '\n')
-    with open(filepath, 'a') as of:
-        for dataline in data[filepath][1]:
-            of.write(dataline + "\n")
-    counter += 1
-    if counter % 1000 == 0:
-        print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))
+data_with_gt = {}
+data_no_gt = {}
+data_unseen = {}
+
+for key in data:
+    if "with_groundtruth" in key:
+        data_with_gt[key] = data[key]
+    elif "without_groundtruth" in key:
+        data_no_gt[key] = data[key]
+    elif "unseen" in key:
+        data_unseen[key] = data[key]
+
+pickle.dump(data_with_gt, open(pickle_fp_with_gt, 'wb'))
+pickle.dump(data_no_gt, open(pickle_fp_no_gt, 'wb'))
+pickle.dump(data_unseen, open(pickle_fp_unseen, 'wb'))
+
+
+# print("extracting each file to disk (only groundtruth we cared)..")
+# counter = 0
+# for filepath in data:
+#     if "with_groundtruth" not in filepath:
+#         continue
+#     if not os.path.isfile(filepath):
+#         outputsubfolder = os.path.dirname(filepath)
+#         if not os.path.isdir(outputsubfolder):
+#             try:
+#                 os.makedirs(outputsubfolder)
+#             except BaseException:
+#                 print("err: cannot create folder {}!!!!".format(outputsubfolder))
+#                 continue
+#         with open(filepath, 'w') as of:
+#             of.write(headlines.get(data[filepath][0], headline) + '\n')
+#     with open(filepath, 'a') as of:
+#         for dataline in data[filepath][1]:
+#             of.write(dataline + "\n")
+#     counter += 1
+#     if counter % 1000 == 0:
+#         print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))
diff --git a/opencellid_parsing/003.extract_only_enough_data_with_groundtruth.py b/opencellid_parsing/003.extract_only_enough_data_with_groundtruth.py
new file mode 100644
index 0000000..7cb41b7
--- /dev/null
+++ b/opencellid_parsing/003.extract_only_enough_data_with_groundtruth.py
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+
+import os
+import sys
+import pickle
+
+folderpath = sys.argv[1]
+outputfolder = folderpath.rstrip('/') + '_extracted'
+if not os.path.isdir(outputfolder):
+    os.makedirs(outputfolder)
+
+headline = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
+headlines = {
+    'UMTS': headline + ',rnc,cidshort,psc',
+    'GSM': headline + ',timingadv',
+    'LTE': headline + ',timingadv,tac,pci',
+    'CDMA': headline + ',sid,nid,bid',
+}
+
+############################
+# extract measurements
+############################
+pickle_fp_with_gt = "{}/all_with_groundtruth.pickle".format(folderpath)
+
+if not os.path.isfile(pickle_fp_with_gt):
+    print("run 002 script first")
+    sys.exit(2)
+
+data = pickle.load(open(pickle_fp_with_gt, 'rb'))
+
+print("extracting each file to disk (only groundtruth we cared)..")
+counter = 0
+counter_valid = 0
+for filepath in data:
+    counter += 1
+    if counter % 1000 == 0:
+        print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))
+    if len(data[filepath][1]) < 10:
+        print("{}: entry less than 10, skipping".format(filepath))
+        continue
+    counter_valid += 1
+    if not os.path.isfile(filepath):
+        outputsubfolder = os.path.dirname(filepath)
+        if not os.path.isdir(outputsubfolder):
+            try:
+                os.makedirs(outputsubfolder)
+            except BaseException:
+                print("err: cannot create folder {}!!!!".format(outputsubfolder))
+                continue
+        with open(filepath, 'w') as of:
+            of.write(headlines.get(data[filepath][0], headline) + '\n')
+    with open(filepath, 'a') as of:
+        for dataline in data[filepath][1]:
+            of.write(dataline + "\n")
+print("done: {} out of {} files extracted ({:.2f}%)".format(counter_valid, len(data), 100.0 * counter_valid / len(data)))