Separate parsing and extraction of data into two files; fix bug
This commit is contained in:
parent
ab5c0c3f55
commit
d579a7d122
|
|
@ -92,28 +92,12 @@ else:
|
||||||
|
|
||||||
|
|
||||||
files = [file for file in os.listdir(folderpath) if '.csv' in file]
|
files = [file for file in os.listdir(folderpath) if '.csv' in file]
|
||||||
for file in sorted(files, key=lambda x: x.rstrip('.csv').split('_')[-1]):
|
for file in sorted(files, key=lambda x: int(x.rstrip('.csv').split('_')[-1])):
|
||||||
|
|
||||||
print("looking at file: {}".format(file))
|
print("looking at file: {}".format(file))
|
||||||
pickle_file = file.replace('.csv', '_parsed.pickle')
|
pickle_file = file.replace('.csv', '_parsed.pickle')
|
||||||
|
|
||||||
if os.path.isfile("{}/{}".format(folderpath, pickle_file)):
|
if not os.path.isfile("{}/{}".format(folderpath, pickle_file)):
|
||||||
print("found the pickle, loading directly..")
|
|
||||||
mybufferdict = pickle.load(open("{}/{}".format(folderpath, pickle_file), 'rb'))
|
|
||||||
|
|
||||||
print("extracting each file to disk..")
|
|
||||||
counter = 0
|
|
||||||
for filepath in mybufferdict:
|
|
||||||
if not os.path.isfile(filepath):
|
|
||||||
with open(filepath, 'w') as of:
|
|
||||||
of.write(headlines.get(mybufferdict[filepath][0], headline) + '\n')
|
|
||||||
with open(filepath, 'a') as of:
|
|
||||||
for dataline in mybufferdict[filepath][1]:
|
|
||||||
of.write(dataline + "\n")
|
|
||||||
counter += 1
|
|
||||||
if counter % 1000 == 0:
|
|
||||||
print("progress: {} out of {} extracted ({:.2f}%)".format(counter, len(mybufferdict), 100.0 * counter / len(mybufferdict)))
|
|
||||||
else:
|
|
||||||
print("parsing and loading in memory..")
|
print("parsing and loading in memory..")
|
||||||
with open("{}/{}".format(folderpath, file), 'r') as f:
|
with open("{}/{}".format(folderpath, file), 'r') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
|
|
@ -198,11 +182,6 @@ for file in sorted(files, key=lambda x: x.rstrip('.csv').split('_')[-1]):
|
||||||
else:
|
else:
|
||||||
mydict = {}
|
mydict = {}
|
||||||
outputsubfolder = "{}/unseen/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
|
outputsubfolder = "{}/unseen/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
|
||||||
try:
|
|
||||||
if not os.path.isdir(outputsubfolder):
|
|
||||||
os.makedirs(outputsubfolder)
|
|
||||||
except BaseException:
|
|
||||||
raise
|
|
||||||
gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
|
gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
|
||||||
filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
|
filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
|
||||||
if filepath not in mybufferdict:
|
if filepath not in mybufferdict:
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
#!/usr/bin/python
"""Combine per-chunk parsed pickles into one dict and extract measurements to CSV.

Usage: script.py <folderpath>

Loads every ``*_parsed.pickle`` produced by the parsing step (or a cached
``all.pickle`` if present), merges them into a single dict keyed by output
file path, then writes each groundtruth entry out as a CSV file.
"""

import os
import sys
import pickle

print("load ~55GB data into memory! do not try this if not on server machine")


############################
# extract measurements
############################
folderpath = sys.argv[1]
all_pickle_fp = "{}/all.pickle".format(folderpath)

outputfolder = folderpath.rstrip('/') + '_extracted'
if not os.path.isdir(outputfolder):
    os.makedirs(outputfolder)

# Base CSV header; per-radio-technology variants append extra columns.
headline = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
headlines = {
    'UMTS': headline + ',rnc,cidshort,psc',
    'GSM': headline + ',timingadv',
    'LTE': headline + ',timingadv,tac,pci',
    'CDMA': headline + ',sid,nid,bid',
}

if os.path.isfile(all_pickle_fp):
    # load data only
    # BUG FIX: the mode belongs to open(), not pickle.load() --
    # the original passed 'rb' as a second argument to pickle.load and
    # opened the file in text mode, which raises at runtime. pickle
    # requires a binary-mode file object.
    with open(all_pickle_fp, 'rb') as pf:
        data = pickle.load(pf)
else:
    # load and combine data from all per-chunk pickles
    files = [file for file in os.listdir(folderpath) if '_parsed.pickle' in file]
    data = {}
    # BUG FIX: str.rstrip('_parsed.pickle') strips a *character set*, not a
    # suffix, so chunk numbers ending in any of those characters would be
    # mangled before int(); replace() removes the literal suffix instead.
    for file in sorted(files, key=lambda x: int(x.replace('_parsed.pickle', '').split('_')[-1])):
        print("loading {}".format(file))
        with open("{}/{}".format(folderpath, file), 'rb') as pf:
            datatmp = pickle.load(pf)
        print("combining..")
        for key in datatmp:
            if key not in data:
                data[key] = datatmp[key]
            else:
                # entry looks like (radio_type, [datalines]) -- concatenate
                # the dataline lists for entries seen in multiple chunks
                data[key][1] += datatmp[key][1]

    # BUG FIX: same misplaced parenthesis as above -- cache the combined
    # dict by opening the file in binary *write* mode.
    with open(all_pickle_fp, 'wb') as pf:
        pickle.dump(data, pf)


print("extracting each file to disk (only groundtruth we cared)..")
counter = 0
for filepath in data:
    # only extract entries whose path marks them as having groundtruth
    if "with_groundtruth" not in filepath:
        continue
    if not os.path.isfile(filepath):
        outputsubfolder = os.path.dirname(filepath)
        if not os.path.isdir(outputsubfolder):
            try:
                os.makedirs(outputsubfolder)
            except BaseException:
                # best-effort: skip this entry rather than abort the whole run
                print("err: cannot create folder {}!!!!".format(outputsubfolder))
                continue
        # new file: write the radio-specific header line first
        with open(filepath, 'w') as of:
            of.write(headlines.get(data[filepath][0], headline) + '\n')
    with open(filepath, 'a') as of:
        for dataline in data[filepath][1]:
            of.write(dataline + "\n")
    counter += 1
    if counter % 1000 == 0:
        print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))
|
||||||
Loading…
Reference in New Issue