diff --git a/opencellid_parsing/001.data_extraction.py b/opencellid_parsing/001.parse_measurements.py
similarity index 89%
rename from opencellid_parsing/001.data_extraction.py
rename to opencellid_parsing/001.parse_measurements.py
index 7d4b79a..618e047 100644
--- a/opencellid_parsing/001.data_extraction.py
+++ b/opencellid_parsing/001.parse_measurements.py
@@ -92,28 +92,12 @@ else:
     files = [file for file in os.listdir(folderpath) if '.csv' in file]
 
-for file in sorted(files, key=lambda x: x.rstrip('.csv').split('_')[-1]):
+for file in sorted(files, key=lambda x: int(x.rstrip('.csv').split('_')[-1])):
     print("looking at file: {}".format(file))
 
     pickle_file = file.replace('.csv', '_parsed.pickle')
-    if os.path.isfile("{}/{}".format(folderpath, pickle_file)):
-        print("found the pickle, loading directly..")
-        mybufferdict = pickle.load(open("{}/{}".format(folderpath, pickle_file), 'rb'))
-
-        print("extracting each file to disk..")
-        counter = 0
-        for filepath in mybufferdict:
-            if not os.path.isfile(filepath):
-                with open(filepath, 'w') as of:
-                    of.write(headlines.get(mybufferdict[filepath][0], headline) + '\n')
-            with open(filepath, 'a') as of:
-                for dataline in mybufferdict[filepath][1]:
-                    of.write(dataline + "\n")
-            counter += 1
-            if counter % 1000 == 0:
-                print("progress: {} out of {} extracted ({:.2f}%)".format(counter, len(mybufferdict), 100.0 * counter / len(mybufferdict)))
-    else:
+    if not os.path.isfile("{}/{}".format(folderpath, pickle_file)):
         print("parsing and loading in memory..")
         with open("{}/{}".format(folderpath, file), 'r') as f:
             lines = f.readlines()
 
 
@@ -198,11 +182,6 @@ for file in sorted(files, key=lambda x: x.rstrip('.csv').split('_')[-1]):
         else:
             mydict = {}
         outputsubfolder = "{}/unseen/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
-        try:
-            if not os.path.isdir(outputsubfolder):
-                os.makedirs(outputsubfolder)
-        except BaseException:
-            raise
         gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
         filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
         if filepath not in mybufferdict:
diff --git a/opencellid_parsing/002.extract_measurements.py b/opencellid_parsing/002.extract_measurements.py
new file mode 100644
index 0000000..0715439
--- /dev/null
+++ b/opencellid_parsing/002.extract_measurements.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python
+
+import os
+import sys
+import pickle
+
+print("load ~55GB data into memory! do not try this if not on server machine")
+
+
+############################
+# extract measurements
+############################
+folderpath = sys.argv[1]
+all_pickle_fp = "{}/all.pickle".format(folderpath)
+
+outputfolder = folderpath.rstrip('/') + '_extracted'
+if not os.path.isdir(outputfolder):
+    os.makedirs(outputfolder)
+
+headline = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
+headlines = {
+    'UMTS': headline + ',rnc,cidshort,psc',
+    'GSM': headline + ',timingadv',
+    'LTE': headline + ',timingadv,tac,pci',
+    'CDMA': headline + ',sid,nid,bid',
+}
+
+if os.path.isfile(all_pickle_fp):
+    # load data only
+    data = pickle.load(open(all_pickle_fp, 'rb'))
+else:
+    # load and combine data
+    files = [file for file in os.listdir(folderpath) if '_parsed.pickle' in file]
+    data = {}
+    for file in sorted(files, key=lambda x: int(x.rstrip('_parsed.pickle').split('_')[-1])):
+        print("loading {}".format(file))
+        datatmp = pickle.load(open("{}/{}".format(folderpath, file), 'rb'))
+        print("combining..")
+        for key in datatmp:
+            if key not in data:
+                data[key] = datatmp[key]
+            else:
+                data[key][1] += datatmp[key][1]
+
+    pickle.dump(data, open(all_pickle_fp, 'wb'))
+
+
+print("extracting each file to disk (only groundtruth we cared)..")
+counter = 0
+for filepath in data:
+    if "with_groundtruth" not in filepath:
+        continue
+    if not os.path.isfile(filepath):
+        outputsubfolder = os.path.dirname(filepath)
+        if not os.path.isdir(outputsubfolder):
+            try:
+                os.makedirs(outputsubfolder)
+            except BaseException:
+                print("err: cannot create folder {}!!!!".format(outputsubfolder))
+                continue
+        with open(filepath, 'w') as of:
+            of.write(headlines.get(data[filepath][0], headline) + '\n')
+    with open(filepath, 'a') as of:
+        for dataline in data[filepath][1]:
+            of.write(dataline + "\n")
+    counter += 1
+    if counter % 1000 == 0:
+        print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))