Refactored

This commit is contained in:
Johannes Randerath 2024-06-11 22:16:45 +02:00
parent 1dd906a87f
commit 9f7c360096
4 changed files with 77 additions and 262 deletions

View File

@ -28,6 +28,7 @@ jaraco.collections==5.0.1
jaraco.context==5.3.0 jaraco.context==5.3.0
jaraco.functools==4.0.1 jaraco.functools==4.0.1
jaraco.text==3.12.0 jaraco.text==3.12.0
jedi==0.19.1
Jinja2==3.1.4 Jinja2==3.1.4
joblib==1.4.2 joblib==1.4.2
lxml==5.2.2 lxml==5.2.2
@ -40,6 +41,7 @@ numpy==1.26.4
packaging==24.0 packaging==24.0
pandas==2.2.2 pandas==2.2.2
parsimonious==0.10.0 parsimonious==0.10.0
parso==0.8.4
Pattern==3.6 Pattern==3.6
pdfminer.six==20231228 pdfminer.six==20231228
platformdirs==4.2.2 platformdirs==4.2.2

View File

@ -7,10 +7,16 @@ Contents
-------- --------
Constants Constants
--------- ---------
gtfs_schema : dict{str,list[str]}
Maps GTFS file names (without filename extension) to fields described by the GTFS Reference
reversed_file_mapping : dict(str,str) reversed_file_mapping : dict(str,str)
Map CamelCased filenames to '_'-separated Map CamelCased filenames to '_'-separated
class_names : dict{str,str}
Map CamelCase, singularized class names to pluralized, snake_cased file names
primary_keys : dict{django.db.models.Model, (str or None)}
For all pt_map.models, map primary keys if applicable
foreign_keys
For all pt_map.models, map foreign keys if any. The entries are ordered so that models can be created without foreign reference conflicts.
time_delta : int
Unix time for Jan 1, 2024. To be used to calculate time prefix strings.
Functions Functions
--------- ---------
@ -43,248 +49,10 @@ import time
import datetime import datetime
import django.db.models import django.db.models
import time import time
from pt_map.gtfs_schema import gtfs_schema
time_delta = int(datetime.datetime(2024,1,1).timestamp()) time_delta = int(datetime.datetime(2024,1,1).timestamp())
gtfs_schema = {
"agency": [
"agency_id",
"agency_name",
"agency_url",
"agency_timezone",
"agency_lang",
"agency_phone",
"agency_email",
"agency_fare_url"
],
"stops": [
"stop_id",
"stop_code",
"stop_name",
"stop_desc",
"stop_lat",
"stop_lon",
"zone_id",
"stop_url",
"location_type",
"parent_station",
"stop_timezone",
"wheelchair_boarding",
"level_id",
"platform_code"
],
"routes": [
"route_id",
"agency_id",
"route_short_name",
"route_long_name",
"route_desc",
"route_type",
"route_url",
"route_color",
"route_text_color",
"route_sort_order",
"continuous_pickup",
"continuous_drop_off"
],
"trips": [
"trip_id",
"route_id",
"service_id",
"trip_headsign",
"trip_short_name",
"direction_id",
"block_id",
"shape_id",
"wheelchair_accessible",
"bikes_allowed"
],
"stop_times": [
"trip_id",
"arrival_time",
"departure_time",
"stop_id",
"stop_sequence",
"stop_headsign",
"pickup_type",
"drop_off_type",
"shape_dist_traveled",
"timepoint"
],
"calendar": [
"service_id",
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday",
"start_date",
"end_date"
],
"calendar_dates": [
"service_id",
"date",
"exception_type"
],
"fare_attributes": [
"fare_id",
"price",
"currency_type",
"payment_method",
"transfers",
"transfer_duration"
],
"fare_rules": [
"fare_id",
"route_id",
"origin_id",
"destination_id",
"contains_id"
],
"timeframes": [
"timeframe_id",
"start_time",
"end_time",
"headway_sec",
"exact_times"
],
"fare_media": [
"media_id",
"agency_id",
"fare_id",
"seat_type",
"price"
],
"fare_products": [
"product_id",
"agency_id",
"product_type",
"fare_id",
"product_name",
"short_name",
"description",
"duration",
"transfers"
],
"fare_leg_rules": [
"fare_id",
"route_id",
"origin_id",
"destination_id",
"contains_id"
],
"fare_transfer_rules": [
"from_fare_id",
"to_fare_id",
"transfer_type",
"min_transfer_time"
],
"areas": [
"area_id",
"area_name",
"area_description"
],
"stop_areas": [
"stop_area_id",
"stop_id",
"area_id",
"location_type",
"parent_station",
"fare_zone_id"
],
"networks": [
"network_id",
"network_name",
"network_description"
],
"route_networks": [
"route_id",
"network_id"
],
"shapes": [
"shape_id",
"shape_pt_lat",
"shape_pt_lon",
"shape_pt_sequence",
"shape_dist_traveled"
],
"frequencies": [
"trip_id",
"start_time",
"end_time",
"headway_secs",
"exact_times"
],
"transfers": [
"from_stop_id",
"to_stop_id",
"transfer_type",
"min_transfer_time"
],
"pathways": [
"pathway_id",
"from_stop_id",
"to_stop_id",
"pathway_mode",
"is_bidirectional",
"length",
"traversal_time",
"stair_count",
"max_slope",
"min_width",
"signposted_as",
"reversed_signposted_as"
],
"levels": [
"level_id",
"level_index",
"level_name"
],
"location_groups": [
"location_group_id",
"location_group_name"
],
"location_group_stops": [
"location_group_id",
"stop_id"
],
"locations_geojson": [
"type",
"features"
],
"booking_rules": [
"rule_id",
"stop_id",
"rule_type",
"booking_url",
"admission_rules",
"admission_requirements"
],
"translations": [
"table_name",
"field_name",
"language",
"translation"
],
"feed_info": [
"feed_publisher_name",
"feed_publisher_url",
"feed_lang",
"default_lang",
"feed_start_date",
"feed_end_date",
"feed_version",
"feed_contact_email",
"feed_contact_url"
],
"attributions": [
"attribution_id",
"organization_name",
"is_producer"
]
}
primary_keys = { pt_map.models.Agency: "agency_id", primary_keys = { pt_map.models.Agency: "agency_id",
@ -432,6 +200,18 @@ def stdz(v, m: django.db.models.Model, f: str):
return v return v
def to_snake_case(name): def to_snake_case(name):
"""
Convert CamelCase to snake_case.
Parameters
----------
name : str
str in CamelCase
Returns
-------
Str in snake_case
"""
name = name[0].lower() + name[1:] name = name[0].lower() + name[1:]
for c in name[1:]: for c in name[1:]:
if c.isupper(): if c.isupper():
@ -441,6 +221,20 @@ def to_snake_case(name):
return name return name
def unqfk(ts, fk): def unqfk(ts, fk):
"""
Primary keys of imported data and in the database are likely to overlap. To avoid this, the current time in seconds since Jan 1, 2024 is added as a prefix.
Foreign key references must know of this new key so they are processed in the same way. To make this possible, we use the same time in seconds for all objects.
Parameters
----------
ts : str
time in seconds to be prepended
fk : str or int-convertible number
primary or foreign key to be processed; non-str values are converted via str(int(fk))
Returns
-------
Str with prefix
"""
if not isinstance(fk, str): if not isinstance(fk, str):
fk = str(int(fk)) fk = str(int(fk))
return f"{ts}{fk}".strip() return f"{ts}{fk}".strip()
@ -454,28 +248,23 @@ def gtfs_to_db(g: pt_map.gtfs.GTFS):
g : gtfs.GTFS g : gtfs.GTFS
GTFS object to be saved to db GTFS object to be saved to db
""" """
ts = str(int(time.time())-time_delta) ts = str(int(time.time())-time_delta) # Prepend the current time in seconds since Jan 1, 2024 to ids to make them more or less unique
for model in foreign_keys: for model in foreign_keys:
if model[0] in [pt_map.models.Calendar, pt_map.models.CalendarDate, ]:
continue
m = model[0] m = model[0]
df = getattr(g, class_names[m.__name__]).data df = getattr(g, class_names[m.__name__]).data # Extract dataframe for each model from gtfs.GTFS object
if not df.empty: if not df.empty: # Only process GTFS files actually present
v = gtfs_schema[class_names[m.__name__]] v = gtfs_schema[class_names[m.__name__]] # field names
for _, row in df.iterrows(): for _, row in df.iterrows(): # the rows of the dataframe are the individual entries in the GTFS file and should be the individual instances of the db model
for fk in model[1]: for fk in model[1]: # Map foreign_keys to objects of the foreign model
if row.get(fk[1]): if row.get(fk[1]):
row[fk[1]] = fk[0].objects.get(**{primary_keys[fk[0]]: unqfk(ts, row[fk[1]])}) row[fk[1]] = fk[0].objects.get(**{primary_keys[fk[0]]: unqfk(ts, row[fk[1]])})
defaults = {field: stdz(row.get(field), m, field) for field in v if row.get(field) and not is_NaN(row[field])} defaults = {field: stdz(row.get(field), m, field) for field in v if row.get(field) and not is_NaN(row[field])} # dict of fields and values of current model object to create
print(model[0])
if model[0] == pt_map.models.StopTime:
print(row)
if primary_keys[m]: if primary_keys[m]:
row[primary_keys[m]] = unqfk(ts, row[primary_keys[m]]) row[primary_keys[m]] = unqfk(ts, row[primary_keys[m]]) # primary_keys should be unique, use current time in seconds as a prefix
defaults[primary_keys[m]] = row[primary_keys[m]] defaults[primary_keys[m]] = row[primary_keys[m]]
try: try:
m.objects.get(**{primary_keys[m]: row[primary_keys[m]]}) m.objects.get(**{primary_keys[m]: row[primary_keys[m]]}) # Make sure there is no object with an identical primary key; m.DoesNotExist is expected to be raised
except m.DoesNotExist: except m.DoesNotExist:
m.objects.update_or_create( m.objects.update_or_create(
defaults = defaults, defaults = defaults,

View File

@ -1,4 +1,11 @@
"""
Make gtfs_schema constant available to modules in package without circular imports.
Constants
---------
gtfs_schema : dict{str,list[str]}
Maps GTFS file names (without filename extension) to fields described by the GTFS Reference
"""
gtfs_schema = { gtfs_schema = {
"agency": [ "agency": [
"agency_id", "agency_id",
@ -238,3 +245,4 @@ gtfs_schema = {
"is_producer" "is_producer"
] ]
} }

View File

@ -15,12 +15,28 @@ from .forms import *
import json import json
from datetime import datetime from datetime import datetime
def print_r(r, s):
if not len(r):
print(s)
return r
def get_timetable(r, trips, stop_sequences): def get_timetable(r, trips, stop_sequences):
"""
Given a pt_map.models.Route, calculate the timetable for all its stops.
Parameters
----------
r : pt_map.models.Route
Route the timetable should be calculated for
trips : dict(str, list(pt_map.Trip))
Dictionary mapping route_ids to the lists of trips travelling on them
stop_sequences : dict(str, list(str))
Dict mapping route_ids to lists of stop_ids they serve. Currently the first trip is taken as reference for stops and sequence.
Returns
-------
dict{"stop_sequence": list(str), "stop_times": dict(str, list(str))}
Dict containing two elements:
"stop_sequence" : list(str)
list of stop_ids the route serves
"stop_times" : dict(str, list(str))
dict mapping stop_ids from stop_sequence to time strings the route is serving the stop at
"""
timetable = {"stop_sequence": stop_sequences[r.route_id]} timetable = {"stop_sequence": stop_sequences[r.route_id]}
sts = {} sts = {}
for stop in stop_sequences[r.route_id]: for stop in stop_sequences[r.route_id]: