#!/usr/bin/python
"""Combine parsed measurement pickles and extract ground-truth files to disk.

Usage: <this script> <folder>

<folder> is expected to contain either a previously combined ``all.pickle``
or a set of ``*_<index>_parsed.pickle`` chunk files. The combined data is a
dict keyed by output file path; each value is indexed as ``value[0]`` (used
as a key into HEADLINES, presumably the network type -- confirm against the
parser) and ``value[1]`` (a sequence of text lines, one per measurement).

SECURITY NOTE: pickle can execute arbitrary code while loading. Only run this
script on pickle files produced by a trusted parser.
"""
import os
import pickle
import sys

# File-name suffix that identifies a parsed chunk file.
PARSED_SUFFIX = '_parsed.pickle'

# Header columns common to every network type, plus per-type extensions.
HEADLINE = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
HEADLINES = {
    'UMTS': HEADLINE + ',rnc,cidshort,psc',
    'GSM': HEADLINE + ',timingadv',
    'LTE': HEADLINE + ',timingadv,tac,pci',
    'CDMA': HEADLINE + ',sid,nid,bid',
}


def _chunk_index(filename):
    """Return the integer chunk index encoded just before PARSED_SUFFIX.

    For ``foo_12_parsed.pickle`` this returns 12. Raises ValueError when the
    token before the suffix is not an integer.
    """
    # Slice the suffix off explicitly; str.rstrip() strips a *set of
    # characters*, not a suffix, and only worked here by coincidence.
    stem = filename[:-len(PARSED_SUFFIX)]
    return int(stem.split('_')[-1])


def _load_pickle(path):
    """Load and return one pickled object from *path* (binary mode)."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def load_data(folderpath):
    """Return the combined measurement dict for *folderpath*.

    If ``<folderpath>/all.pickle`` exists it is loaded directly. Otherwise
    every ``*_parsed.pickle`` chunk in the folder is loaded in ascending
    chunk-index order and merged, and the result is cached as ``all.pickle``.
    """
    all_pickle_fp = "{}/all.pickle".format(folderpath)
    if os.path.isfile(all_pickle_fp):
        # load data only
        return _load_pickle(all_pickle_fp)

    # load and combine data
    chunk_names = [name for name in os.listdir(folderpath)
                   if name.endswith(PARSED_SUFFIX)]
    data = {}
    for name in sorted(chunk_names, key=_chunk_index):
        print("loading {}".format(name))
        datatmp = _load_pickle("{}/{}".format(folderpath, name))
        print("combining..")
        for key in datatmp:
            if key not in data:
                data[key] = datatmp[key]
            else:
                # Same output file seen in an earlier chunk: append its lines.
                data[key][1] += datatmp[key][1]

    # Write to a temporary name first so an interrupted dump never leaves a
    # truncated all.pickle that the next run would try to load.
    tmp_fp = all_pickle_fp + '.tmp'
    with open(tmp_fp, 'wb') as fh:
        pickle.dump(data, fh)
    os.rename(tmp_fp, all_pickle_fp)
    return data


def extract_groundtruth(data):
    """Write every entry whose path contains "with_groundtruth" to disk.

    A new file gets the header matching ``data[path][0]`` (falling back to
    HEADLINE); an existing file is appended to without a header. Entries
    whose parent folder cannot be created are reported and skipped.

    Returns the number of files written.
    """
    targets = [filepath for filepath in data if "with_groundtruth" in filepath]
    total = len(targets)
    counter = 0
    for filepath in targets:
        entry = data[filepath]
        is_new = not os.path.isfile(filepath)
        if is_new:
            outputsubfolder = os.path.dirname(filepath)
            # dirname is '' for a bare file name; there is nothing to create.
            if outputsubfolder and not os.path.isdir(outputsubfolder):
                try:
                    os.makedirs(outputsubfolder)
                except OSError:
                    print("err: cannot create folder {}!!!!".format(outputsubfolder))
                    continue
        with open(filepath, 'a') as of:
            if is_new:
                of.write(HEADLINES.get(entry[0], HEADLINE) + '\n')
            for dataline in entry[1]:
                of.write(dataline + "\n")
        counter += 1
        if counter % 1000 == 0:
            print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, total, 100.0 * counter / total))
    return counter


def main(argv=None):
    """Run the extraction for the folder named in ``argv[1]``.

    Returns a process exit status: 0 on success, 2 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: {} <folder>\n".format(argv[0] if argv else "extract"))
        return 2

    print("load ~55GB data into memory! do not try this if not on server machine")

    ############################
    # extract measurements
    ############################
    folderpath = argv[1]
    # NOTE(review): this folder is created but nothing below writes into it;
    # files are written to the paths stored as keys of the data dict.
    outputfolder = folderpath.rstrip('/') + '_extracted'
    if not os.path.isdir(outputfolder):
        os.makedirs(outputfolder)

    data = load_data(folderpath)

    print("extracting each file to disk (only groundtruth we cared)..")
    extract_groundtruth(data)
    return 0


if __name__ == "__main__":
    sys.exit(main())