From ab5c0c3f55a72530cc53311f05012136acfe4fa4 Mon Sep 17 00:00:00 2001
From: HappyZ
Date: Sun, 9 Jun 2019 19:18:49 -0500
Subject: [PATCH] Update 001.data_extraction.py

Cache intermediate results as pickles so that reruns can skip CSV parsing.

* Cell towers: when both <celltowers>_with_gt.pickle and
  <celltowers>_no_gt.pickle exist, the two tower dicts are loaded from
  them; otherwise the cell-tower CSV is parsed as before and both pickles
  are written.
* Measurements: for every measurement CSV, when <file>_parsed.pickle
  exists, the buffered dict {output path: [radio, [datalines]]} is loaded
  and appended to the per-cell CSV files; otherwise the CSV is parsed into
  that dict in memory and the dict is dumped to the pickle (the per-cell
  CSV files are not written during that pass).
* Measurement rows with an empty signal field are skipped.
* Progress is printed while parsing and while extracting.
* The tower dicts (including their 'counter' fields) are dumped to
  *_with_gt_counted.pickle and *_no_gt_counted.pickle at the end.

Review fixes applied to the added lines (line counts unchanged):
* pickle.load() is given an open binary file object instead of
  (path, 'rb'), which raised TypeError whenever the cache existed.
* `changeable` is compared with `==` instead of `is`.

---
 opencellid_parsing/001.data_extraction.py | 335 ++++++++++++----------
 1 file changed, 187 insertions(+), 148 deletions(-)

diff --git a/opencellid_parsing/001.data_extraction.py b/opencellid_parsing/001.data_extraction.py
index 3021703..7d4b79a 100644
--- a/opencellid_parsing/001.data_extraction.py
+++ b/opencellid_parsing/001.data_extraction.py
@@ -5,8 +5,10 @@ import sys
 import pickle
 
 # global
-folderpath = sys.argv[1] # folder containing all measurement_xx.csv
-celltowers_fp = sys.argv[2] # csv file containing all cell_tower info
+folderpath = sys.argv[1]
+celltowers_fp = sys.argv[2]
+celltowers_fp_with_gt = celltowers_fp.replace('.csv', '_with_gt.pickle')
+celltowers_fp_no_gt = celltowers_fp.replace('.csv', '_no_gt.pickle')
 
 
 ############################
@@ -14,56 +16,61 @@ celltowers_fp = sys.argv[2] # csv file containing all cell_tower info
 ############################
 celltowers_cared = {}
 celltowers_not_cared = {}
-
-try:
-    firstLine = True
-    with open(celltowers_fp, 'r') as f:
-        lines = f.readlines()
-    for line in lines:
-        if firstLine:
-            firstLine = False
-            continue
-        tmp = line.rstrip().split(',')
-        radio = tmp[0] # Network type. One of the strings GSM, UMTS, LTE or CDMA.
-        mcc = tmp[1] # mobile country code
-        net = tmp[2] # For GSM, UMTS and LTE networks, this is the Mobile Network Code (MNC).
-        # For CDMA networks, this is the System IDentification number (SID).
-        area = tmp[3] # Location Area Code (LAC) for GSM and UMTS networks.
-        # Tracking Area Code (TAC) for LTE networks.
-        # Network IDenfitication number (NID) for CDMA networks.
-        cell = tmp[4] # Cell ID (CID) for GSM and LTE networks.
-        # UTRAN Cell ID / LCID for UMTS networks,
-        # which is the concatenation of 2 or 4 bytes of
-        # Radio Network Controller (RNC) code and 4 bytes of Cell ID.
-        # Base station IDentifier number (BID) for CDMA networks.
-        unit = tmp[5] # Primary Scrambling Code (PSC) for UMTS networks.
-        # Physical Cell ID (PCI) for LTE networks.
-        # An empty value for GSM and CDMA networks.
-        lon = float(tmp[6]) if not tmp[6] == '' else float('nan')
-        lat = float(tmp[7]) if not tmp[7] == '' else float('nan')
-        cellrange = tmp[8] # Estimate of cell range, in meters.
-        samples = tmp[9] # Total number of measurements assigned to the cell tower
-        changeable = int(tmp[10])
-        created = tmp[11]
-        updated = tmp[12]
-        averageSignal = tmp[13]
-        key = "{}_{}_{}_{}_{}".format(radio, mcc, net, area, cell)
-        mydict = celltowers_not_cared
-        if changeable is 0:
-            mydict = celltowers_cared
-        mydict[key] = {
-            'location': (lon, lat),
-            'mcc': int(mcc) if not mcc == '' else None,
-            'net': int(net) if not net == '' else None,
-            'area': int(area) if not area == '' else None,
-            'cell': int(cell) if not cell == '' else None,
-            'unit': int(unit) if not unit == '' else None,
-            'cellrange': cellrange,
-            'samples': samples,
-            'counter': 0
-        }
-except BaseException:
-    raise
+if os.path.isfile(celltowers_fp_with_gt) and os.path.isfile(celltowers_fp_no_gt):
+    celltowers_cared = pickle.load(open(celltowers_fp_with_gt, 'rb'))
+    celltowers_not_cared = pickle.load(open(celltowers_fp_no_gt, 'rb'))
+else:
+    try:
+        firstLine = True
+        with open(celltowers_fp, 'r') as f:
+            lines = f.readlines()
+        for line in lines:
+            if firstLine:
+                firstLine = False
+                continue
+            tmp = line.rstrip().split(',')
+            radio = tmp[0] # Network type. One of the strings GSM, UMTS, LTE or CDMA.
+            mcc = tmp[1] # mobile country code
+            net = tmp[2] # For GSM, UMTS and LTE networks, this is the Mobile Network Code (MNC).
+            # For CDMA networks, this is the System IDentification number (SID).
+            area = tmp[3] # Location Area Code (LAC) for GSM and UMTS networks.
+            # Tracking Area Code (TAC) for LTE networks.
+            # Network IDenfitication number (NID) for CDMA networks.
+            cell = tmp[4] # Cell ID (CID) for GSM and LTE networks.
+            # UTRAN Cell ID / LCID for UMTS networks,
+            # which is the concatenation of 2 or 4 bytes of
+            # Radio Network Controller (RNC) code and 4 bytes of Cell ID.
+            # Base station IDentifier number (BID) for CDMA networks.
+            unit = tmp[5] # Primary Scrambling Code (PSC) for UMTS networks.
+            # Physical Cell ID (PCI) for LTE networks.
+            # An empty value for GSM and CDMA networks.
+            lon = float(tmp[6]) if not tmp[6] == '' else float('nan')
+            lat = float(tmp[7]) if not tmp[7] == '' else float('nan')
+            cellrange = tmp[8] # Estimate of cell range, in meters.
+            samples = tmp[9] # Total number of measurements assigned to the cell tower
+            changeable = int(tmp[10])
+            created = tmp[11]
+            updated = tmp[12]
+            averageSignal = tmp[13]
+            key = "{}_{}_{}_{}_{}".format(radio, mcc, net, area, cell)
+            mydict = celltowers_not_cared
+            if changeable == 0:
+                mydict = celltowers_cared
+            mydict[key] = {
+                'location': (lon, lat),
+                'mcc': int(mcc) if not mcc == '' else None,
+                'net': int(net) if not net == '' else None,
+                'area': int(area) if not area == '' else None,
+                'cell': int(cell) if not cell == '' else None,
+                'unit': int(unit) if not unit == '' else None,
+                'cellrange': cellrange,
+                'samples': samples,
+                'counter': 0
+            }
+    except BaseException:
+        raise
+    pickle.dump(celltowers_cared, open(celltowers_fp_with_gt, 'wb'))
+    pickle.dump(celltowers_not_cared, open(celltowers_fp_no_gt, 'wb'))
 
 
 ############################
@@ -83,102 +90,134 @@ if not os.path.isdir(outputfolder):
 else:
     print("folder exists, will not overwrite but append")
 
+
 
 files = [file for file in os.listdir(folderpath) if '.csv' in file]
 for file in sorted(files, key=lambda x: x.rstrip('.csv').split('_')[-1]):
-    print("looking at file: {}".format(file))
-    with open("{}/{}".format(folderpath, file), 'r') as f:
-        lines = f.readlines()
-    for line in lines:
-        tmp = line.rstrip().split(',')
-        # mobile country code
-        mcc = int(tmp[0]) if not tmp[0] == '' else float('nan')
-        # For GSM, UMTS and LTE networks, this is the Mobile Network Code (MNC).
-        # For CDMA networks, this is the System IDentification number (SID).
-        net = int(tmp[1]) if not tmp[1] == '' else float('nan')
-        # Location Area Code (LAC) for GSM and UMTS networks.
-        # Tracking Area Code (TAC) for LTE networks.
-        # Network IDenfitication number (NID) for CDMA networks.
-        area = int(tmp[2]) if not tmp[2] == '' else float('nan')
-        # Cell ID (CID) for GSM and LTE networks.
-        # UTRAN Cell ID / LCID for UMTS networks,
-        # which is the concatenation of 2 or 4 bytes of
-        # Radio Network Controller (RNC) code and 4 bytes of Cell ID.
-        # Base station IDentifier number (BID) for CDMA networks.
-        cell = int(tmp[3]) if not tmp[3] == '' else float('nan')
-        # measurement location
-        lon = float(tmp[4]) if not tmp[4] == '' else float('nan')
-        lat = float(tmp[5]) if not tmp[5] == '' else float('nan')
-        # signal level
-        sig = int(tmp[6]) if not tmp[6] == '' else float('nan')
-        # measured and uploaded-to-opencellid time
-        measure_t = int(tmp[7]) if not tmp[7] == '' else float('nan')
-        created_t = int(tmp[8]) if not tmp[8] == '' else float('nan')
-        # GPS quality/accuracy information (metres)
-        rating = float(tmp[9]) if not tmp[9] == '' else float('nan')
-        # Speed when creating the measurement; both metres/second and km/h is accepted.
-        speed = float(tmp[10]) if not tmp[10] == '' else float('nan')
-        # Heading direction of the phone / telematics device at the moment
-        # the measurement was created (0=north, 90=east, 180=south, 270=west)
-        direction = float(tmp[11]) if not tmp[11] == '' else float('nan')
-        # Network type. One of the strings GSM, UMTS, LTE or CDMA
-        radio = tmp[12]
-        # Timing advance; only for GSM and LTE
-        timingadv = int(tmp[13]) if not tmp[13] == '' else float('nan')
-        # Radio network controller; only for UMTS
-        rnc = int(tmp[14]) if not tmp[14] == '' else float('nan')
-        # Cell id (short); only for UMTS
-        cidshort = int(tmp[15]) if not tmp[15] == '' else float('nan')
-        # Primary scrambling code; only for UMTS
-        psc = int(tmp[16]) if not tmp[16] == '' else float('nan')
-        # Tracking area code; only for LTE
-        tac = int(tmp[17]) if not tmp[17] == '' else float('nan')
-        # Physical cell id; only for LTE
-        pci = int(tmp[18]) if not tmp[18] == '' else float('nan')
-        # System identifier; only for CDMA
-        sid = int(tmp[19]) if not tmp[19] == '' else float('nan')
-        # Network id; only for CDMA
-        nid = int(tmp[20]) if not tmp[20] == '' else float('nan')
-        # Base station id; only for CDMA
-        bid = int(tmp[21]) if not tmp[21] == '' else float('nan')
-        # format dataline
-        dataline = (
-            "{:.6f},{:.6f},{},{},{},{:.2f},{:.2f},{:.0f}"
-            .format(lon, lat, sig, measure_t, created_t, rating, speed, direction)
-        )
-        if radio == 'UMTS':
-            dataline += ",{},{},{}".format(rnc, cidshort, psc)
-        elif radio == 'GSM':
-            dataline += ",{}".format(timingadv)
-        elif radio == 'LTE':
-            dataline += ",{},{},{}".format(timingadv, tac, pci)
-        elif radio == 'CDMA':
-            dataline += ",{},{},{}".format(sid, nid, bid)
-        # get key in previous dict
-        key = "{}_{}_{}_{}_{}".format(radio, mcc, net, area, cell)
-        # folder level: outputfolder/with(out)_groundtruth/radio/mcc/net/area/
-        if key in celltowers_cared:
-            mydict = celltowers_cared
-            outputsubfolder = "{}/with_groundtruth/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
-        elif key in celltowers_not_cared:
-            mydict = celltowers_not_cared
-            outputsubfolder = "{}/without_groundtruth/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
-        else:
-            mydict = {}
-            outputsubfolder = "{}/unseen/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
-        try:
-            if not os.path.isdir(outputsubfolder):
-                os.makedirs(outputsubfolder)
-        except BaseException:
-            raise
-        gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
-        filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
-        if not os.path.isfile(filepath):
-            with open(filepath, 'w') as of:
-                of.write(headlines.get(radio, headline) + '\n')
-        with open(filepath, 'a') as of:
-            of.write(dataline + '\n')
-        if key in mydict:
-            mydict[key]['counter'] += 1
-pickle.dump(celltowers_cared, open(celltowers_fp.replace('.csv', '_with_gt.pickle'), 'wb'))
-pickle.dump(celltowers_not_cared, open(celltowers_fp.replace('.csv', '_no_gt.pickle'), 'wb'))
+    print("looking at file: {}".format(file))
+    pickle_file = file.replace('.csv', '_parsed.pickle')
+
+    if os.path.isfile("{}/{}".format(folderpath, pickle_file)):
+        print("found the pickle, loading directly..")
+        mybufferdict = pickle.load(open("{}/{}".format(folderpath, pickle_file), 'rb'))
+
+        print("extracting each file to disk..")
+        counter = 0
+        for filepath in mybufferdict:
+            if not os.path.isfile(filepath):
+                with open(filepath, 'w') as of:
+                    of.write(headlines.get(mybufferdict[filepath][0], headline) + '\n')
+            with open(filepath, 'a') as of:
+                for dataline in mybufferdict[filepath][1]:
+                    of.write(dataline + "\n")
+            counter += 1
+            if counter % 1000 == 0:
+                print("progress: {} out of {} extracted ({:.2f}%)".format(counter, len(mybufferdict), 100.0 * counter / len(mybufferdict)))
+    else:
+        print("parsing and loading in memory..")
+        with open("{}/{}".format(folderpath, file), 'r') as f:
+            lines = f.readlines()
+        mybufferdict = {}
+        counter = 0
+        for line in lines:
+            tmp = line.rstrip().split(',')
+            # mobile country code
+            mcc = int(tmp[0]) if not tmp[0] == '' else float('nan')
+            # For GSM, UMTS and LTE networks, this is the Mobile Network Code (MNC).
+            # For CDMA networks, this is the System IDentification number (SID).
+            net = int(tmp[1]) if not tmp[1] == '' else float('nan')
+            # Location Area Code (LAC) for GSM and UMTS networks.
+            # Tracking Area Code (TAC) for LTE networks.
+            # Network IDenfitication number (NID) for CDMA networks.
+            area = int(tmp[2]) if not tmp[2] == '' else float('nan')
+            # Cell ID (CID) for GSM and LTE networks.
+            # UTRAN Cell ID / LCID for UMTS networks,
+            # which is the concatenation of 2 or 4 bytes of
+            # Radio Network Controller (RNC) code and 4 bytes of Cell ID.
+            # Base station IDentifier number (BID) for CDMA networks.
+            cell = int(tmp[3]) if not tmp[3] == '' else float('nan')
+            # measurement location
+            lon = float(tmp[4]) if not tmp[4] == '' else float('nan')
+            lat = float(tmp[5]) if not tmp[5] == '' else float('nan')
+            # signal level
+            if tmp[6] == '':
+                continue
+            sig = int(tmp[6]) if not tmp[6] == '' else float('nan')
+            # measured and uploaded-to-opencellid time
+            measure_t = int(tmp[7]) if not tmp[7] == '' else float('nan')
+            created_t = int(tmp[8]) if not tmp[8] == '' else float('nan')
+            # GPS quality/accuracy information (metres)
+            rating = float(tmp[9]) if not tmp[9] == '' else float('nan')
+            # Speed when creating the measurement; both metres/second and km/h is accepted.
+            speed = float(tmp[10]) if not tmp[10] == '' else float('nan')
+            # Heading direction of the phone / telematics device at the moment
+            # the measurement was created (0=north, 90=east, 180=south, 270=west)
+            direction = float(tmp[11]) if not tmp[11] == '' else float('nan')
+            # Network type. One of the strings GSM, UMTS, LTE or CDMA
+            radio = tmp[12]
+            # Timing advance; only for GSM and LTE
+            timingadv = int(tmp[13]) if not tmp[13] == '' else float('nan')
+            # Radio network controller; only for UMTS
+            rnc = int(tmp[14]) if not tmp[14] == '' else float('nan')
+            # Cell id (short); only for UMTS
+            cidshort = int(tmp[15]) if not tmp[15] == '' else float('nan')
+            # Primary scrambling code; only for UMTS
+            psc = int(tmp[16]) if not tmp[16] == '' else float('nan')
+            # Tracking area code; only for LTE
+            tac = int(tmp[17]) if not tmp[17] == '' else float('nan')
+            # Physical cell id; only for LTE
+            pci = int(tmp[18]) if not tmp[18] == '' else float('nan')
+            # System identifier; only for CDMA
+            sid = int(tmp[19]) if not tmp[19] == '' else float('nan')
+            # Network id; only for CDMA
+            nid = int(tmp[20]) if not tmp[20] == '' else float('nan')
+            # Base station id; only for CDMA
+            bid = int(tmp[21]) if not tmp[21] == '' else float('nan')
+            # format dataline
+            dataline = (
+                "{:.6f},{:.6f},{},{},{},{:.2f},{:.2f},{:.0f}"
+                .format(lon, lat, sig, measure_t, created_t, rating, speed, direction)
+            )
+            if radio == 'UMTS':
+                dataline += ",{},{},{}".format(rnc, cidshort, psc)
+            elif radio == 'GSM':
+                dataline += ",{}".format(timingadv)
+            elif radio == 'LTE':
+                dataline += ",{},{},{}".format(timingadv, tac, pci)
+            elif radio == 'CDMA':
+                dataline += ",{},{},{}".format(sid, nid, bid)
+            # get key in previous dict
+            key = "{}_{}_{}_{}_{}".format(radio, mcc, net, area, cell)
+            # folder level: outputfolder/with(out)_groundtruth/radio/mcc/net/area/
+            if key in celltowers_cared:
+                mydict = celltowers_cared
+                outputsubfolder = "{}/with_groundtruth/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
+            elif key in celltowers_not_cared:
+                mydict = celltowers_not_cared
+                outputsubfolder = "{}/without_groundtruth/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
+            else:
+                mydict = {}
+                outputsubfolder = "{}/unseen/{}/{}/{}/{}".format(outputfolder, radio, mcc, net, area)
+            try:
+                if not os.path.isdir(outputsubfolder):
+                    os.makedirs(outputsubfolder)
+            except BaseException:
+                raise
+            gt_loc = mydict.get(key, {}).get('location', (float('nan'), float('nan')))
+            filepath = "{}/{}_{:.6f}_{:.6f}.csv".format(outputsubfolder, cell, gt_loc[0], gt_loc[1])
+            if filepath not in mybufferdict:
+                mybufferdict[filepath] = [radio, []]
+            mybufferdict[filepath][1].append(dataline)
+            if key in mydict:
+                mydict[key]['counter'] += 1
+            counter += 1
+            if counter % 10000 == 0:
+                print("progress: {} out of {} parsed ({:.2f}%)".format(counter, len(lines), 100.0 * counter / len(lines)))
+
+        print("dumping into pickle..")
+        pickle.dump(mybufferdict, open("{}/{}".format(folderpath, pickle_file), 'wb'))
+
+
+pickle.dump(celltowers_cared, open(celltowers_fp_with_gt.replace("_gt", "_gt_counted"), 'wb'))
+pickle.dump(celltowers_not_cared, open(celltowers_fp_no_gt.replace("_gt", "_gt_counted"), 'wb'))
+