import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
from scipy import stats

# Silence every warning so library chatter does not clutter the cell output.
warnings.filterwarnings("ignore")

sb.set_theme()

# When False, the download step is skipped and the existing CSV is read as-is.
DOWNLOAD_FRESH_DATA = True
data_file = "data.csv"

if DOWNLOAD_FRESH_DATA:
    from pathlib import Path

    import requests

    # url.txt supplies the download address. Strip surrounding whitespace
    # (typically the trailing newline editors append) so it never ends up
    # inside the request URL.
    url = Path("url.txt").read_text().strip()
    print("downloading fresh data")
    # Without a timeout a stalled server would hang this cell indefinitely.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Persist the payload byte-for-byte. Going through resp.text would decode
    # with a guessed charset and re-encode with the platform default (plus
    # newline translation), which can corrupt the CSV.
    Path(data_file).write_bytes(resp.content)


df = pd.read_csv(data_file)

# Quick sanity check of the raw data: first rows and summary statistics.
display(df.head())
display(df.describe())

downloading fresh data

# Parse the raw timestamp strings, then derive per-event time features.
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
ts = df["Timestamp"].dt

# True for events before noon, False otherwise.
df["morning"] = ts.hour < 12
# Time of day as a fractional hour (e.g. 07:30:00 -> 7.5).
df["hour"] = ts.hour + ts.minute / 60 + ts.second / 3600
# Calendar date of the event, without the time component.
df["day"] = ts.date
df.head()

# Trim leading/trailing whitespace from the event labels so stray spaces in
# the data cannot break the label-based lookups further down.
df["Event"] = df["Event"].str.strip()

from itertools import product


# Summarise the time of day for every (morning, Event) combination:
# morning/afternoon crossed with Enter/Exit.
g = df.set_index(["morning", "Event"])
for key in product((True, False), ("Enter", "Exit")):
    display(key)
    subset = g.loc[key]
    display(subset["hour"].describe())

(True, 'Enter')

count    186.000000
mean       7.124108
std        0.541980
min        5.168333
25%        6.940417
50%        7.121806
75%        7.287847
max        9.043056
Name: hour, dtype: float64

(True, 'Exit')

count    186.000000
mean       7.645827
std        0.567747
min        5.620556
25%        7.467847
50%        7.655833
75%        7.924722
max        9.507222
Name: hour, dtype: float64

(False, 'Enter')

count    172.000000
mean      17.367600
std        1.098670
min       13.498889
25%       16.648264
50%       17.422500
75%       17.894861
max       20.247222
Name: hour, dtype: float64

(False, 'Exit')

count    172.000000
mean      17.943891
std        1.070946
min       14.117500
25%       17.225347
50%       17.977361
75%       18.466042
max       20.777778
Name: hour, dtype: float64

# Pivot the event log so each (day, morning) pair becomes one row, with the
# Enter/Exit values of every remaining column side by side.
trip = df.pivot_table(index=["day", "morning"], columns="Event").reset_index()

# Collapse the two-level column labels into single strings: "<name>_<event>"
# for pivoted columns, plain "<name>" where the second level is empty.
flat_names = []
for top, sub in trip.columns.to_flat_index():
    flat_names.append("_".join((top, sub)) if sub else top)
trip.columns = flat_names

# Ride duration in minutes (the hour columns are fractional hours).
trip["ridetime"] = (trip["hour_Exit"] - trip["hour_Enter"]) * 60.0
# Human-readable direction label derived from the morning flag.
direction_labels = {True: "To work", False: "From work"}
trip["Direction"] = trip["morning"].map(direction_labels)

# Show the first rows and keep a copy on disk.
display(trip.head())
trip.to_csv("trip.csv", index=False)

# Summary statistics of ride duration: morning trips first, then the rest.
for is_morning in (True, False):
    in_period = trip["morning"] == is_morning
    display(trip.loc[in_period, "ridetime"].describe())

count    186.000000
mean      31.303136
std        3.523828
min       23.983333
25%       29.300000
50%       31.066667
75%       32.541667
max       51.066667
Name: ridetime, dtype: float64

count    172.000000
mean      34.577422
std        3.954678
min       25.466667
25%       31.704167
50%       33.983333
75%       36.737500
max       45.216667
Name: ridetime, dtype: float64

# Histogram of ride duration over all trips, both directions pooled.
ax: plt.Axes = sb.histplot(x="ridetime", data=trip)
ax.set(xlabel="Ride length (min)")
ax.get_figure().tight_layout()

# Histogram of the Enter time of day in half-hour bins, coloured by direction.
ax: plt.Axes = sb.histplot(
    data=trip,
    x="hour_Enter",
    hue="Direction",
    binwidth=0.5,
)
ax.set(xlabel="Enter time, 30 min bins")
ax.get_figure().tight_layout()

# Box plot of ride duration, one box per direction.
ax: plt.Axes = sb.boxplot(x="ridetime", hue="Direction", data=trip)
ax.set(xlabel="Ride length (min)")
# Wide, short canvas; tight_layout runs last so it sees the final size.
ax.get_figure().set_size_inches(10, 4)
ax.get_figure().tight_layout()

# Ride duration over time, one line per direction.
ax: plt.Axes = sb.lineplot(
    data=trip,
    x="Timestamp_Enter",
    y="ridetime",
    hue="Direction",
)
ax.set(xlabel="Date", ylabel="Ride length (min)")
# Wide, short canvas; tight_layout runs last so it sees the final size.
ax.get_figure().set_size_inches(14, 4)
ax.get_figure().tight_layout()

# Two-sample Welch t-test (equal_var=False) comparing mean ride time of
# morning trips against all other trips.
ride_to_work = trip[trip.morning == True].ridetime
ride_from_work = trip[trip.morning == False].ridetime
if ride_to_work.hasnans or ride_from_work.hasnans:
    print("NaNs found in ridetimes")
    # ttest_ind propagates NaN by default, and a NaN p-value compares False
    # against alpha, which would silently print "Do not reject H0". Drop the
    # missing values so the test only sees real measurements.
    ride_to_work = ride_to_work.dropna()
    ride_from_work = ride_from_work.dropna()

# NOTE(review): both samples are cut to the length of the shorter one, which
# discards the tail of the longer series. Welch's test accepts unequal
# sample sizes -- confirm whether this truncation is intentional.
n = min(len(ride_to_work), len(ride_from_work))
print(f"{n=}")

alpha = 0.01  # significance level, i.e. 99% confidence

result = stats.ttest_ind(ride_to_work.to_list()[:n], ride_from_work.to_list()[:n], equal_var=False)
print(result)
# Report the interval at the same confidence level the decision below uses;
# confidence_interval() would otherwise default to 95%.
print(result.confidence_interval(confidence_level=1 - alpha))

if result.pvalue < alpha:
    print("Reject H0 in favor of HA that average to-work and from-work ride times are not equal.")
else:
    print("Do not reject H0 that average to-work and from-work ride times are equal.")

n=172
TtestResult(statistic=np.float64(-7.923175942018825), pvalue=np.float64(3.375544224704507e-14), df=np.float64(338.3723532885828))
ConfidenceInterval(low=np.float64(-4.014870168078449), high=np.float64(-2.417881769906071))
Reject H0 in favor of HA that average to-work and from-work ride times are not equal.

	Timestamp	Event
count	716	716
unique	716	2
top	6/19/2025 20:19:01	Enter
freq	1	358

	day	morning	Timestamp_Enter	Timestamp_Exit	hour_Enter	hour_Exit	ridetime	Direction
0	2024-07-16	False	2024-07-16 19:33:00	2024-07-16 20:05:00	19.550000	20.083333	32.000000	From work
1	2024-07-16	True	2024-07-16 08:01:00	2024-07-16 08:33:00	8.016667	8.550000	32.000000	To work
2	2024-07-17	False	2024-07-17 17:13:29	2024-07-17 17:49:28	17.224722	17.824444	35.983333	From work
3	2024-07-18	False	2024-07-18 17:30:37	2024-07-18 18:06:03	17.510278	18.100833	35.433333	From work
4	2024-07-18	True	2024-07-18 07:25:46	2024-07-18 07:56:16	7.429444	7.937778	30.500000	To work

Bus ride times

Get data

Destructure time

Transform

Plots

Distribution of all ride times

Distribution of entry time by direction

Box plot of ride times by direction

Time series of ride times by direction

T-Test of mean directional ride times

Conclusion

	Timestamp	Event
0	7/16/2024 8:01:00	Enter
1	7/16/2024 8:33:00	Exit
2	7/16/2024 19:33:00	Enter
3	7/16/2024 20:05:00	Exit
4	7/17/2024 17:13:29	Enter