Still too slow to dump each file individually; split the combined pickle into three separate pickles instead
This commit is contained in:
parent
ae72ee1816
commit
f085380ad7
|
|
@ -12,22 +12,13 @@ print("load ~55GB data into memory! do not try this if not on server machine")
|
|||
############################
|
||||
folderpath = sys.argv[1]
|
||||
all_pickle_fp = "{}/all.pickle".format(folderpath)
|
||||
|
||||
outputfolder = folderpath.rstrip('/') + '_extracted'
|
||||
if not os.path.isdir(outputfolder):
|
||||
os.makedirs(outputfolder)
|
||||
|
||||
headline = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
|
||||
headlines = {
|
||||
'UMTS': headline + ',rnc,cidshort,psc',
|
||||
'GSM': headline + ',timingadv',
|
||||
'LTE': headline + ',timingadv,tac,pci',
|
||||
'CDMA': headline + ',sid,nid,bid',
|
||||
}
|
||||
pickle_fp_with_gt = "{}/all_with_groundtruth.pickle".format(folderpath)
|
||||
pickle_fp_no_gt = "{}/all_no_groundtruth.pickle".format(folderpath)
|
||||
pickle_fp_unseen = "{}/all_unseen.pickle".format(folderpath)
|
||||
|
||||
if os.path.isfile(all_pickle_fp):
|
||||
# load data only
|
||||
data = pickle.load(open(all_pickle_fp), 'rb')
|
||||
data = pickle.load(open(all_pickle_fp, 'rb'))
|
||||
else:
|
||||
# load and combine data
|
||||
files = [file for file in os.listdir(folderpath) if '_parsed.pickle' in file]
|
||||
|
|
@ -45,24 +36,41 @@ else:
|
|||
pickle.dump(data, open(all_pickle_fp, 'wb'))
|
||||
|
||||
|
||||
print("extracting each file to disk (only groundtruth we cared)..")
|
||||
counter = 0
|
||||
for filepath in data:
|
||||
if "with_groundtruth" not in filepath:
|
||||
continue
|
||||
if not os.path.isfile(filepath):
|
||||
outputsubfolder = os.path.dirname(filepath)
|
||||
if not os.path.isdir(outputsubfolder):
|
||||
try:
|
||||
os.makedirs(outputsubfolder)
|
||||
except BaseException:
|
||||
print("err: cannot create folder {}!!!!".format(outputsubfolder))
|
||||
continue
|
||||
with open(filepath, 'w') as of:
|
||||
of.write(headlines.get(data[filepath][0], headline) + '\n')
|
||||
with open(filepath, 'a') as of:
|
||||
for dataline in data[filepath][1]:
|
||||
of.write(dataline + "\n")
|
||||
counter += 1
|
||||
if counter % 1000 == 0:
|
||||
print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))
|
||||
# Split the combined dict into three smaller dicts, selected by a substring of
# each key, so that each subset can be pickled and later loaded on its own.
data_with_gt = {}
data_no_gt = {}
data_unseen = {}

for key in data:
    # Keys matching none of the three markers are not copied anywhere.
    if "with_groundtruth" in key:
        data_with_gt[key] = data[key]
    elif "without_groundtruth" in key:
        data_no_gt[key] = data[key]
    elif "unseen" in key:
        data_unseen[key] = data[key]

# Dump each subset to its own file; `with` closes (and flushes) every handle
# before the next dump starts.
with open(pickle_fp_with_gt, 'wb') as pickle_file:
    pickle.dump(data_with_gt, pickle_file)
# Fixed: this previously referenced the undefined name `pickle_no_gt`.
with open(pickle_fp_no_gt, 'wb') as pickle_file:
    pickle.dump(data_no_gt, pickle_file)
with open(pickle_fp_unseen, 'wb') as pickle_file:
    pickle.dump(data_unseen, pickle_file)
|
||||
|
||||
|
||||
# print("extracting each file to disk (only groundtruth we cared)..")
|
||||
# counter = 0
|
||||
# for filepath in data:
|
||||
# if "with_groundtruth" not in filepath:
|
||||
# continue
|
||||
# if not os.path.isfile(filepath):
|
||||
# outputsubfolder = os.path.dirname(filepath)
|
||||
# if not os.path.isdir(outputsubfolder):
|
||||
# try:
|
||||
# os.makedirs(outputsubfolder)
|
||||
# except BaseException:
|
||||
# print("err: cannot create folder {}!!!!".format(outputsubfolder))
|
||||
# continue
|
||||
# with open(filepath, 'w') as of:
|
||||
# of.write(headlines.get(data[filepath][0], headline) + '\n')
|
||||
# with open(filepath, 'a') as of:
|
||||
# for dataline in data[filepath][1]:
|
||||
# of.write(dataline + "\n")
|
||||
# counter += 1
|
||||
# if counter % 1000 == 0:
|
||||
# print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, len(data), 100.0 * counter / len(data)))
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pickle
|
||||
|
||||
############################
# extract measurements
############################
# NOTE(review): this script was reconstructed from a rendering that had lost
# its indentation; the nesting below follows the statement order and should
# be confirmed against the real file.

# The input folder must be known before anything is derived from it
# (it was previously read only after `outputfolder` had already used it).
folderpath = sys.argv[1]
pickle_fp_with_gt = "{}/all_with_groundtruth.pickle".format(folderpath)

# Create the sibling "<folderpath>_extracted" directory up front.
outputfolder = folderpath.rstrip('/') + '_extracted'
if not os.path.isdir(outputfolder):
    os.makedirs(outputfolder)

# Header line written at the top of each newly created file; the dict maps
# the first element of a data entry to a header with extra columns.
headline = '#lon,lat,sig,measure_at,upload_at,rating,speed,direction'
headlines = {
    'UMTS': headline + ',rnc,cidshort,psc',
    'GSM': headline + ',timingadv',
    'LTE': headline + ',timingadv,tac,pci',
    'CDMA': headline + ',sid,nid,bid',
}

if not os.path.isfile(pickle_fp_with_gt):
    print("run 002 script first")
    sys.exit(2)

# NOTE: pickle.load can execute arbitrary code; only load pickles that were
# produced by the companion script on a trusted machine.
with open(pickle_fp_with_gt, 'rb') as pickle_file:
    data = pickle.load(pickle_file)

print("extracting each file to disk (only groundtruth we cared)..")
total = len(data)
counter = 0
counter_valid = 0
for filepath in data:
    counter += 1
    if counter % 1000 == 0:
        print("progress: {} out of {} files extracted ({:.2f}%)".format(counter, total, 100.0 * counter / total))

    # entry[0] selects the header; entry[1] is the sequence of data lines.
    entry = data[filepath]
    if len(entry[1]) < 10:
        print("{}: entry less than 10, skipping".format(filepath))
        continue
    counter_valid += 1

    if not os.path.isfile(filepath):
        outputsubfolder = os.path.dirname(filepath)
        if not os.path.isdir(outputsubfolder):
            try:
                os.makedirs(outputsubfolder)
            except OSError:
                # Only filesystem errors are skipped; Ctrl-C and sys.exit
                # are no longer swallowed as they were with BaseException.
                print("err: cannot create folder {}!!!!".format(outputsubfolder))
                continue
        # New file: start it with the matching header line.
        with open(filepath, 'w') as of:
            of.write(headlines.get(entry[0], headline) + '\n')

    # NOTE(review): assumed to sit outside the `if` above, so a file that
    # already exists gets the lines appended again on a re-run -- confirm.
    with open(filepath, 'a') as of:
        for dataline in entry[1]:
            of.write(dataline + "\n")

# Guard the percentage so an empty pickle does not raise ZeroDivisionError.
percent_valid = 100.0 * counter_valid / total if total else 0.0
print("done: {} out of {} files extracted ({:.2f}%)".format(counter_valid, total, percent_valid))
|
||||
Loading…
Reference in New Issue