Changes between Version 13 and Version 14 of AnalyticalStatisticalQuerying


Ignore:
Timestamp:
09/30/25 20:42:24 (13 days ago)
Author:
222077
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • AnalyticalStatisticalQuerying

    v13 v14  
    168168
    169169}}}
    170 
    171 
    172170
    173171== 6. Top selling routes for transport organizers ==
     
    210208= Relational Algebra =
    211209
     210== 3. Top user-purchased routes ==
     211
     212==== Original Query
     213{{{
     214WITH route_weighted_usage AS (
     215    SELECT
     216        r.route_id,
     217        l_from.name AS from_location,
     218        l_to.name AS to_location,
     219        to_.company_name AS transport_company,
     220        SUM(
     221            CASE
     222                WHEN DATE_TRUNC('month', t.date_purchased) = DATE_TRUNC('month', CURRENT_DATE)
     223                    THEN 0.5
     224                WHEN DATE_TRUNC('month', t.date_purchased) = DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
     225                    THEN 0.3
     226                WHEN DATE_TRUNC('month', t.date_purchased) = DATE_TRUNC('month', CURRENT_DATE - INTERVAL '2 months')
     227                    THEN 0.2
     228                ELSE 0
     229            END
     230        ) AS weighted_score
     231    FROM ticket t
     232    JOIN trip tr ON t.trip_id = tr.trip_id
     233    JOIN route r ON tr.route_id = r.route_id
     234    JOIN location l_from ON r.from_location_id = l_from.location_id
     235    JOIN location l_to ON r.to_location_id = l_to.location_id
     236    JOIN transport_organizer to_ ON r.transport_organizer_id = to_.transport_organizer_id
     237    WHERE t.account_id = 300 AND t.date_purchased >= CURRENT_DATE - INTERVAL '3 months'
     238    GROUP BY r.route_id, l_from.name, l_to.name, to_.company_name
     239)
     240SELECT
     241    route_id,
     242    from_location,
     243    to_location,
     244    transport_company,
     245    weighted_score
     246FROM route_weighted_usage
     247WHERE weighted_score >= (SELECT AVG(weighted_score) FROM route_weighted_usage)
     248ORDER BY weighted_score DESC;
     249}}}
     250
     251== Results (Relational Algebra) ==
     252{{{
     253T1 = σ(account_id = 300 ∧ date_purchased ≥ CURRENT_DATE - 3m)(ticket)
     254J1 = T1 ⨝ trip
     255J2 = J1 ⨝ route
     256J3 = J2 ⨝ l_from
     257J4 = J3 ⨝ l_to
     258J5 = J4 ⨝ transport_organizer
     259
     260P1 = π(route_id, l_from.name, l_to.name, to_.company_name, weight)(
     261       extended projection with CASE(date_purchased) → weight
     262     )(J5)
     263
     264G1 = γ(route_id, from_location, to_location, transport_company;
     265        SUM(weight) → weighted_score)(P1)
     266
     267A1 = γ(; AVG(weighted_score) → avg_score)(G1)
     268
     269Result = τ_desc(weighted_score)(σ(weighted_score ≥ avg_score)(G1))
     270}}}
     271
     272== Analysis (Optimizations) ==
     273 * Push selection down: filter `ticket` by `account_id` and `date_purchased` before joins to reduce data early.
     274 * Compute `weight` in a projection step before aggregation for clarity and efficiency.
     275 * Replace correlated subquery for `AVG` with a window function or single aggregation to avoid repeated scans.
     276 * Indexing opportunities:
     277   - `ticket(account_id, date_purchased)` for fast filtering.
     278   - Foreign key indexes on `ticket.trip_id`, `trip.route_id`, and `route.{from_location_id, to_location_id, transport_organizer_id}`.
     279
     280== Conclusion (Rewritten SQL) ==
     281{{{
     282WITH route_scores AS (
     283    SELECT
     284        r.route_id,
     285        l_from.name AS from_location,
     286        l_to.name   AS to_location,
     287        to_.company_name AS transport_company,
     288        SUM(
     289            CASE
     290                WHEN DATE_TRUNC('month', t.date_purchased) = DATE_TRUNC('month', CURRENT_DATE)
     291                    THEN 0.5
     292                WHEN DATE_TRUNC('month', t.date_purchased) = DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
     293                    THEN 0.3
     294                WHEN DATE_TRUNC('month', t.date_purchased) = DATE_TRUNC('month', CURRENT_DATE - INTERVAL '2 months')
     295                    THEN 0.2
     296                ELSE 0
     297            END
     298        ) AS weighted_score
     299    FROM ticket t
     300    JOIN trip tr ON t.trip_id = tr.trip_id
     301    JOIN route r ON tr.route_id = r.route_id
     302    JOIN location l_from ON r.from_location_id = l_from.location_id
     303    JOIN location l_to   ON r.to_location_id = l_to.location_id
     304    JOIN transport_organizer to_ ON r.transport_organizer_id = to_.transport_organizer_id
     305    WHERE t.account_id = 300
     306      AND t.date_purchased >= CURRENT_DATE - INTERVAL '3 months'
     307    GROUP BY r.route_id, l_from.name, l_to.name, to_.company_name
     308)
     309SELECT *
     310FROM (
     311    SELECT route_scores.*, AVG(weighted_score) OVER () AS avg_score
     312    FROM route_scores
     313) ranked
     314WHERE weighted_score >= avg_score
     315ORDER BY weighted_score DESC;
     316}}}
     317
     318
     319
     320== 5. Most popular destinations in the past year ==
     321
     322==== Original Query
     323{{{
     324WITH destinations AS (
     325    SELECT
     326        l.name AS destination,
     327        COUNT(t.ticket_id) AS total_passengers,
     328        COUNT(DISTINCT t.account_id) AS unique_customers,
     329        SUM(t.price) AS total_revenue,
     330        COUNT(st.student_ticket_id) AS student_tickets,
     331        COUNT(ct.child_ticket_id) AS child_tickets,
     332        l.latitude,
     333        l.longitude
     334    FROM location l
     335    JOIN ticket t ON l.location_id = t.gets_off_location_id
     336    JOIN trip tr ON t.trip_id = tr.trip_id
     337    JOIN transport_organizer to_org ON tr.transport_organizer_id = to_org.transport_organizer_id
     338    LEFT JOIN student_ticket st ON t.ticket_id = st.ticket_id
     339    LEFT JOIN child_ticket ct ON t.ticket_id = ct.ticket_id
     340    GROUP BY l.location_id, l.name, l.latitude, l.longitude
     341),
     342stats AS (
     343    SELECT AVG(total_passengers) AS mean_passengers
     344    FROM destinations
     345)
     346SELECT
     347    d.destination,
     348    d.total_passengers,
     349    d.unique_customers,
     350    d.total_revenue,
     351    d.student_tickets,
     352    d.child_tickets
     353FROM destinations d, stats
     354WHERE d.total_passengers >= stats.mean_passengers
     355ORDER BY d.total_passengers DESC;
     356}}}
     357
     358== Results (Relational Algebra) ==
     359{{{
     360J1 = location ⨝ ticket (l.location_id = t.gets_off_location_id)
     361J2 = J1 ⨝ trip
     362J3 = J2 ⨝ transport_organizer
     363J4 = J3 ⟕ student_ticket
     364J5 = J4 ⟕ child_ticket
     365
     366G1 = γ(l.location_id, l.name, l.latitude, l.longitude;
     367        COUNT(ticket_id) → total_passengers,
     368        COUNT_DISTINCT(account_id) → unique_customers,
     369        SUM(price) → total_revenue,
     370        COUNT(student_ticket_id) → student_tickets,
     371        COUNT(child_ticket_id) → child_tickets)(J5)
     372
     373A1 = γ(; AVG(total_passengers) → mean_passengers)(G1)
     374
     375Result = τ_desc(total_passengers)(σ(total_passengers ≥ mean_passengers)(G1))
     376}}}
     377
     378== Analysis (Optimizations) ==
     379 * Push projections early: keep only `location_id`, `ticket_id`, `account_id`, `price`, and join keys before grouping.
     380 * Apply the date filter (`past year`) explicitly in the `WHERE` clause on `ticket.date_purchased` (missing in original).
     381 * Replace cross join with `stats` by using a window function (`AVG() OVER ()`) to avoid scanning `destinations` twice.
     382 * Indexing opportunities:
     383   - `ticket(gets_off_location_id, date_purchased)`
     384   - `student_ticket.ticket_id`, `child_ticket.ticket_id`
     385   - Foreign keys on `trip.route_id`, `trip.transport_organizer_id`.
     386
     387== Conclusion (Rewritten SQL) ==
     388{{{
     389WITH destinations AS (
     390    SELECT
     391        l.name AS destination,
     392        COUNT(t.ticket_id) AS total_passengers,
     393        COUNT(DISTINCT t.account_id) AS unique_customers,
     394        SUM(t.price) AS total_revenue,
     395        COUNT(st.student_ticket_id) AS student_tickets,
     396        COUNT(ct.child_ticket_id) AS child_tickets,
     397        l.latitude,
     398        l.longitude
     399    FROM location l
     400    JOIN ticket t ON l.location_id = t.gets_off_location_id
     401    JOIN trip tr ON t.trip_id = tr.trip_id
     402    JOIN transport_organizer to_org ON tr.transport_organizer_id = to_org.transport_organizer_id
     403    LEFT JOIN student_ticket st ON t.ticket_id = st.ticket_id
     404    LEFT JOIN child_ticket ct ON t.ticket_id = ct.ticket_id
     405    WHERE t.date_purchased >= CURRENT_DATE - INTERVAL '1 year'
     406    GROUP BY l.location_id, l.name, l.latitude, l.longitude
     407)
     408SELECT *
     409FROM (
     410    SELECT d.*,
     411           AVG(total_passengers) OVER () AS mean_passengers
     412    FROM destinations d
     413) ranked
     414WHERE total_passengers >= mean_passengers
     415ORDER BY total_passengers DESC;
     416}}}
    212417
    213418== 6. Top selling routes for transport organizers ==
     
    293498ORDER BY COUNT(tk.ticket_id) DESC;
    294499}}}
    295