Update 001.data_extraction.py
This commit is contained in:
parent
aee45136df
commit
ab5c0c3f55
|
|
@ -5,8 +5,10 @@ import sys
|
|||
import pickle
|
||||
|
||||
# global
folderpath = sys.argv[1]  # folder containing all measurement_xx.csv
celltowers_fp = sys.argv[2]  # csv file containing all cell_tower info

# Pickle caches of the parsed cell-tower table are stored next to the CSV.
# Only a trailing '.csv' is stripped: str.replace() would also rewrite a
# '.csv' occurring in a directory name, and for a path without '.csv' it
# would return the CSV path itself, so dumping the cache would overwrite
# the input file.
_celltowers_stem = (
    celltowers_fp[:-len('.csv')]
    if celltowers_fp.endswith('.csv')
    else celltowers_fp
)
celltowers_fp_with_gt = _celltowers_stem + '_with_gt.pickle'
celltowers_fp_no_gt = _celltowers_stem + '_no_gt.pickle'
|
||||
|
||||
|
||||
############################
|
||||
|
|
@ -14,56 +16,61 @@ celltowers_fp = sys.argv[2] # csv file containing all cell_tower info
|
|||
############################
|
||||
celltowers_cared = {}
celltowers_not_cared = {}


def _int_or_none(text):
    """Return ``int(text)``, or ``None`` when the CSV field is empty."""
    return int(text) if text != '' else None


def _float_or_nan(text):
    """Return ``float(text)``, or NaN when the CSV field is empty."""
    return float(text) if text != '' else float('nan')


def _parse_celltowers(csv_path):
    """Parse the cell-tower CSV into two dictionaries.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on ',' and stored under the key
    ``"<radio>_<mcc>_<net>_<area>_<cell>"`` (built from the raw field text).

    Args:
        csv_path: path of the cell-tower CSV file.

    Returns:
        A ``(cared, not_cared)`` tuple of dicts. Rows whose ``changeable``
        column (index 10) equals 0 go into ``cared``; all others go into
        ``not_cared``.

    Raises:
        IndexError: a non-blank row has fewer than 11 columns.
        ValueError: a numeric column holds non-numeric text.
    """
    cared = {}
    not_cared = {}
    with open(csv_path, 'r') as f:
        next(f, None)  # skip the header line
        for line in f:
            tmp = line.rstrip().split(',')
            if tmp == ['']:
                # A blank line (e.g. at end of file) carries no record.
                continue
            radio = tmp[0]  # Network type. One of the strings GSM, UMTS, LTE or CDMA.
            mcc = tmp[1]    # mobile country code
            net = tmp[2]    # For GSM, UMTS and LTE networks, this is the Mobile Network Code (MNC).
                            # For CDMA networks, this is the System IDentification number (SID).
            area = tmp[3]   # Location Area Code (LAC) for GSM and UMTS networks.
                            # Tracking Area Code (TAC) for LTE networks.
                            # Network IDentification number (NID) for CDMA networks.
            cell = tmp[4]   # Cell ID (CID) for GSM and LTE networks.
                            # UTRAN Cell ID / LCID for UMTS networks,
                            # which is the concatenation of 2 or 4 bytes of
                            # Radio Network Controller (RNC) code and 4 bytes of Cell ID.
                            # Base station IDentifier number (BID) for CDMA networks.
            unit = tmp[5]   # Primary Scrambling Code (PSC) for UMTS networks.
                            # Physical Cell ID (PCI) for LTE networks.
                            # An empty value for GSM and CDMA networks.
            lon = _float_or_nan(tmp[6])
            lat = _float_or_nan(tmp[7])
            cellrange = tmp[8]  # Estimate of cell range, in meters.
            samples = tmp[9]    # Total number of measurements assigned to the cell tower
            changeable = int(tmp[10])

            key = "{}_{}_{}_{}_{}".format(radio, mcc, net, area, cell)
            # Compare by value: `is 0` tests object identity and only works
            # by accident of CPython's small-integer cache.
            target = cared if changeable == 0 else not_cared
            target[key] = {
                'location': (lon, lat),
                'mcc': _int_or_none(mcc),
                'net': _int_or_none(net),
                'area': _int_or_none(area),
                'cell': _int_or_none(cell),
                'unit': _int_or_none(unit),
                'cellrange': cellrange,
                'samples': samples,
                'counter': 0,
            }
    return cared, not_cared


# Reuse the pickled tables when both caches exist; otherwise parse the CSV
# once and write the caches for the next run.
# NOTE(review): pickle executes arbitrary code on load. These caches are
# assumed to be local artefacts written by this script; never point
# celltowers_fp at files from an untrusted source.
if os.path.isfile(celltowers_fp_with_gt) and os.path.isfile(celltowers_fp_no_gt):
    # pickle.load() needs an open binary file object, not a path and a mode.
    with open(celltowers_fp_with_gt, 'rb') as cache_file:
        celltowers_cared = pickle.load(cache_file)
    with open(celltowers_fp_no_gt, 'rb') as cache_file:
        celltowers_not_cared = pickle.load(cache_file)
else:
    celltowers_cared, celltowers_not_cared = _parse_celltowers(celltowers_fp)
    with open(celltowers_fp_with_gt, 'wb') as cache_file:
        pickle.dump(celltowers_cared, cache_file)
    with open(celltowers_fp_no_gt, 'wb') as cache_file:
        pickle.dump(celltowers_not_cared, cache_file)
|
||||
|
||||
|
||||
############################
|
||||
|
|
@ -83,102 +90,134 @@ if not os.path.isdir(outputfolder):
|
|||
else:
|
||||
print("folder exists, will not overwrite but append")
|
||||
|
||||
|
||||
def _to_int(text):
    """Return ``int(text)``, or NaN when the CSV field is empty."""
    return int(text) if text != '' else float('nan')


def _to_float(text):
    """Return ``float(text)``, or NaN when the CSV field is empty."""
    return float(text) if text != '' else float('nan')


def _measurement_order(filename):
    """Sort key for ``<name>_<xx>.csv`` files based on the trailing ``xx``.

    Numeric suffixes are ordered by value (so 2 comes before 10) and sort
    ahead of non-numeric ones, which are ordered as plain strings.
    """
    # Slice the extension off: str.rstrip('.csv') strips any trailing run of
    # the characters '.', 'c', 's', 'v' rather than the literal suffix.
    suffix = filename[:-len('.csv')].split('_')[-1]
    try:
        return (0, int(suffix), suffix)
    except ValueError:
        return (1, 0, suffix)


def _counted_path(cache_path):
    """Insert ``_counted`` after the last ``_gt`` in *cache_path*."""
    # Only the final occurrence is rewritten so that a '_gt' appearing in a
    # directory name is left alone.
    return "_gt_counted".join(cache_path.rsplit("_gt", 1))


# Each measurement file is handled in one of two ways:
#   * no <file>_parsed.pickle yet: parse the CSV, group the formatted data
#     lines per output file in memory and dump that buffer to the pickle;
#   * pickle present: load the buffer and append its lines to the output
#     CSV files on disk.
# NOTE(review): the parse pass does not write the output CSVs and the
# extract pass does not update the per-tower 'counter' values, so a complete
# extraction takes two runs and the *_counted pickles only reflect files
# parsed during the current run. Kept as-is; confirm this is intended.
files = [file for file in os.listdir(folderpath) if file.endswith('.csv')]
for file in sorted(files, key=_measurement_order):
    print("looking at file: {}".format(file))
    pickle_file = file[:-len('.csv')] + '_parsed.pickle'
    pickle_path = "{}/{}".format(folderpath, pickle_file)

    if os.path.isfile(pickle_path):
        print("found the pickle, loading directly..")
        # NOTE(review): pickle is only safe on files this script produced.
        with open(pickle_path, 'rb') as pf:
            mybufferdict = pickle.load(pf)

        print("extracting each file to disk..")
        counter = 0
        for filepath in mybufferdict:
            buffered_radio, buffered_lines = mybufferdict[filepath]
            # The folders were created by the parse pass; recreate them in
            # case the output tree was removed in between.
            parent = os.path.dirname(filepath)
            if parent and not os.path.isdir(parent):
                os.makedirs(parent)
            needs_header = not os.path.isfile(filepath)
            with open(filepath, 'a') as of:
                if needs_header:
                    of.write(headlines.get(buffered_radio, headline) + '\n')
                for dataline in buffered_lines:
                    of.write(dataline + "\n")
            counter += 1
            if counter % 1000 == 0:
                print("progress: {} out of {} extracted ({:.2f}%)".format(counter, len(mybufferdict), 100.0 * counter / len(mybufferdict)))
    else:
        print("parsing and loading in memory..")
        with open("{}/{}".format(folderpath, file), 'r') as f:
            lines = f.readlines()
        mybufferdict = {}
        counter = 0
        for line in lines:
            tmp = line.rstrip().split(',')
            if tmp == ['']:
                # A blank line carries no measurement.
                continue
            # mobile country code
            mcc = _to_int(tmp[0])
            # For GSM, UMTS and LTE networks, this is the Mobile Network Code (MNC).
            # For CDMA networks, this is the System IDentification number (SID).
            net = _to_int(tmp[1])
            # Location Area Code (LAC) for GSM and UMTS networks.
            # Tracking Area Code (TAC) for LTE networks.
            # Network IDentification number (NID) for CDMA networks.
            area = _to_int(tmp[2])
            # Cell ID (CID) for GSM and LTE networks.
            # UTRAN Cell ID / LCID for UMTS networks,
            # which is the concatenation of 2 or 4 bytes of
            # Radio Network Controller (RNC) code and 4 bytes of Cell ID.
            # Base station IDentifier number (BID) for CDMA networks.
            cell = _to_int(tmp[3])
            # measurement location
            lon = _to_float(tmp[4])
            lat = _to_float(tmp[5])
            # signal level; measurements without one are dropped
            if tmp[6] == '':
                continue
            sig = int(tmp[6])
            # measured and uploaded-to-opencellid time
            measure_t = _to_int(tmp[7])
            created_t = _to_int(tmp[8])
            # GPS quality/accuracy information (metres)
            rating = _to_float(tmp[9])
            # Speed when creating the measurement; both metres/second and km/h is accepted.
            speed = _to_float(tmp[10])
            # Heading direction of the phone / telematics device at the moment
            # the measurement was created (0=north, 90=east, 180=south, 270=west)
            direction = _to_float(tmp[11])
            # Network type. One of the strings GSM, UMTS, LTE or CDMA
            radio = tmp[12]
            # Timing advance; only for GSM and LTE
            timingadv = _to_int(tmp[13])
            # Radio network controller; only for UMTS
            rnc = _to_int(tmp[14])
            # Cell id (short); only for UMTS
            cidshort = _to_int(tmp[15])
            # Primary scrambling code; only for UMTS
            psc = _to_int(tmp[16])
            # Tracking area code; only for LTE
            tac = _to_int(tmp[17])
            # Physical cell id; only for LTE
            pci = _to_int(tmp[18])
            # System identifier; only for CDMA
            sid = _to_int(tmp[19])
            # Network id; only for CDMA
            nid = _to_int(tmp[20])
            # Base station id; only for CDMA
            bid = _to_int(tmp[21])

            # format dataline: common columns first, then radio-specific ones
            dataline = (
                "{:.6f},{:.6f},{},{},{},{:.2f},{:.2f},{:.0f}"
                .format(lon, lat, sig, measure_t, created_t, rating, speed, direction)
            )
            if radio == 'UMTS':
                dataline += ",{},{},{}".format(rnc, cidshort, psc)
            elif radio == 'GSM':
                dataline += ",{}".format(timingadv)
            elif radio == 'LTE':
                dataline += ",{},{},{}".format(timingadv, tac, pci)
            elif radio == 'CDMA':
                dataline += ",{},{},{}".format(sid, nid, bid)

            # get key in previous dict
            key = "{}_{}_{}_{}_{}".format(radio, mcc, net, area, cell)
            # folder level: outputfolder/with(out)_groundtruth/radio/mcc/net/area/
            if key in celltowers_cared:
                mydict = celltowers_cared
                category = "with_groundtruth"
            elif key in celltowers_not_cared:
                mydict = celltowers_not_cared
                category = "without_groundtruth"
            else:
                mydict = {}
                category = "unseen"
            outputsubfolder = "{}/{}/{}/{}/{}/{}".format(outputfolder, category, radio, mcc, net, area)
            if not os.path.isdir(outputsubfolder):
                os.makedirs(outputsubfolder)

            # Towers missing from both tables get a NaN location in the file name.
            gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
            filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
            if filepath not in mybufferdict:
                mybufferdict[filepath] = [radio, []]
            mybufferdict[filepath][1].append(dataline)
            if key in mydict:
                mydict[key]['counter'] += 1
            counter += 1
            if counter % 10000 == 0:
                print("progress: {} out of {} parsed ({:.2f}%)".format(counter, len(lines), 100.0 * counter / len(lines)))

        print("dumping into pickle..")
        with open(pickle_path, 'wb') as pf:
            pickle.dump(mybufferdict, pf)

# Persist the tower tables together with their updated measurement counters.
with open(_counted_path(celltowers_fp_with_gt), 'wb') as counted_file:
    pickle.dump(celltowers_cared, counted_file)
with open(_counted_path(celltowers_fp_no_gt), 'wb') as counted_file:
    pickle.dump(celltowers_not_cared, counted_file)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue