Separate parsing and extraction of data into two files; fix bug
This commit is contained in:
parent
ab5c0c3f55
commit
d579a7d122
|
|
@ -92,28 +92,12 @@ else:
|
||||||
|
|
||||||
|
|
||||||
files = [file for file in os.listdir(folderpath) if '.csv' in file]
|
files = [file for file in os.listdir(folderpath) if '.csv' in file]
|
||||||
for file in sorted(files, key=lambda x: x.rstrip('.csv').split('_')[-1]):
|
for file in sorted(files, key=lambda x: int(x.rstrip('.csv').split('_')[-1])):
|
||||||
|
|
||||||
print("looking at file: {}".format(file))
|
print("looking at file: {}".format(file))
|
||||||
pickle_file = file.replace('.csv', '_parsed.pickle')
|
pickle_file = file.replace('.csv', '_parsed.pickle')
|
||||||
|
|
||||||
if os.path.isfile("{}/{}".format(folderpath, pickle_file)):
|
if not os.path.isfile("{}/{}".format(folderpath, pickle_file)):
|
||||||
print("found the pickle, loading directly..")
|
|
||||||
mybufferdict = pickle.load(open("{}/{}".format(folderpath, pickle_file), 'rb'))
|
|
||||||
|
|
||||||
print("extracting each file to disk..")
|
|
||||||
counter = 0
|
|
||||||
for filepath in mybufferdict:
|
|
||||||
if not os.path.isfile(filepath):
|
|
||||||
with open(filepath, 'w') as of:
|
|
||||||
of.write(headlines.get(mybufferdict[filepath][0], headline) + '\n')
|
|
||||||
with open(filepath, 'a') as of:
|
|
||||||
for dataline in mybufferdict[filepath][1]:
|
|
||||||
of.write(dataline + "\n")
|
|
||||||
counter += 1
|
|
||||||
if counter % 1000 == 0:
|
|
||||||
print("progress: {} out of {} extracted ({:.2f}%)".format(counter, len(mybufferdict), 100.0 * counter / len(mybufferdict)))
|
|
||||||
else:
|
|
||||||
print("parsing and loading in memory..")
|
print("parsing and loading in memory..")
|
||||||
with open("{}/{}".format(folderpath, file), 'r') as f:
|
with open("{}/{}".format(folderpath, file), 'r') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
|
|
@ -198,11 +182,6 @@ for file in sorted(files, key=lambda x: x.rstrip('.csv').split('_')[-1]):
|
||||||
else:
|
else:
|
||||||
mydict = {}
|
mydict = {}
|
||||||
outputsubfolder = "{}/unseen/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
|
outputsubfolder = "{}/unseen/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
|
||||||
try:
|
|
||||||
if not os.path.isdir(outputsubfolder):
|
|
||||||
os.makedirs(outputsubfolder)
|
|
||||||
except BaseException:
|
|
||||||
raise
|
|
||||||
gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
|
gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
|
||||||
filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
|
filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
|
||||||
if filepath not in mybufferdict:
|
if filepath not in mybufferdict:
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
#!/usr/bin/python
"""Combine per-chunk parsed pickles into one dict and extract measurements to CSV.

Usage: script.py <folderpath>

Loads every ``*_parsed.pickle`` produced by the parsing step (or a cached
``all.pickle`` if present), merges them into a single dict keyed by output
file path, then writes each groundtruth entry out as a CSV file.
"""

import os
import sys
import pickle

print("load ~55GB data into memory! do not try this if not on server machine")


############################
# extract measurements
############################
folderpath = sys.argv[1]
all_pickle_fp = "{}/all.pickle".format(folderpath)

outputfolder = folderpath.rstrip('/') + '_extracted'
if not os.path.isdir(outputfolder):
    os.makedirs(outputfolder)

# Base CSV header; per-radio-technology variants append extra columns.
headline = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
headlines = {
    'UMTS': headline + ',rnc,cidshort,psc',
    'GSM': headline + ',timingadv',
    'LTE': headline + ',timingadv,tac,pci',
    'CDMA': headline + ',sid,nid,bid',
}

if os.path.isfile(all_pickle_fp):
    # load data only
    # BUG FIX: the mode belongs to open(), not pickle.load() --
    # the original passed 'rb' as a second argument to pickle.load and
    # opened the file in text mode, which raises at runtime. pickle
    # requires a binary-mode file object.
    with open(all_pickle_fp, 'rb') as pf:
        data = pickle.load(pf)
else:
    # load and combine data from all per-chunk pickles
    files = [file for file in os.listdir(folderpath) if '_parsed.pickle' in file]
    data = {}
    # BUG FIX: str.rstrip('_parsed.pickle') strips a *character set*, not a
    # suffix, so chunk numbers ending in any of those characters would be
    # mangled before int(); replace() removes the literal suffix instead.
    for file in sorted(files, key=lambda x: int(x.replace('_parsed.pickle', '').split('_')[-1])):
        print("loading {}".format(file))
        with open("{}/{}".format(folderpath, file), 'rb') as pf:
            datatmp = pickle.load(pf)
        print("combining..")
        for key in datatmp:
            if key not in data:
                data[key] = datatmp[key]
            else:
                # entry looks like (radio_type, [datalines]) -- concatenate
                # the dataline lists for entries seen in multiple chunks
                data[key][1] += datatmp[key][1]

    # BUG FIX: same misplaced parenthesis as above -- cache the combined
    # dict by opening the file in binary *write* mode.
    with open(all_pickle_fp, 'wb') as pf:
        pickle.dump(data, pf)


print("extracting each file to disk (only groundtruth we cared)..")
counter = 0
for filepath in data:
    # only extract entries whose path marks them as having groundtruth
    if "with_groundtruth" not in filepath:
        continue
    if not os.path.isfile(filepath):
        outputsubfolder = os.path.dirname(filepath)
        if not os.path.isdir(outputsubfolder):
            try:
                os.makedirs(outputsubfolder)
            except BaseException:
                # best-effort: skip this entry rather than abort the whole run
                print("err: cannot create folder {}!!!!".format(outputsubfolder))
                continue
        # new file: write the radio-specific header line first
        with open(filepath, 'w') as of:
            of.write(headlines.get(data[filepath][0], headline) + '\n')
    with open(filepath, 'a') as of:
        for dataline in data[filepath][1]:
            of.write(dataline + "\n")
    counter += 1
    if counter % 1000 == 0:
        print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))
|
||||||
Loading…
Reference in New Issue