Pandas Cheat Sheet#

Setup Environment#

# ensures you are running the pip version associated with the current Python kernel
import sys
!{sys.executable} -m pip install requests
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install plotly
import pandas as pd
import numpy as np
import json
import requests

import datetime as dt
from dateutil.relativedelta import relativedelta

import scipy
import scipy.signal  # scipy submodules generally need an explicit import
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# import plotly.offline as pyo
# Set notebook mode to work in offline
# pyo.init_notebook_mode()

Data & File I/O#

JSON Handling#

Read JSON File#

def json_load(f_path):
    """Loads a local JSON file as a Python dict"""
    with open(f_path) as f:
        json_data = json.load(f)
    return json_data
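Writing back out is symmetric; a minimal counterpart (json_dump is a name introduced here, not used elsewhere in this notebook):

def json_dump(obj, f_path):
    """Writes a Python object to a local JSON file (inverse of json_load)."""
    with open(f_path, "w") as f:
        json.dump(obj, f, indent=2)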

Flattening (JSON Normalize)#

json_norm_dict = [
     {
         "id": 1,
         "name": "Cole Volk",
         "fitness": {"height": 130, "weight": 60},
     },
     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
     {
         "id": 2,
         "name": "Faye Raker",
         "fitness": {"height": 130, "weight": 60},
     },
]

pd.json_normalize(json_norm_dict, max_level=0)
id name fitness
0 1.0 Cole Volk {'height': 130, 'weight': 60}
1 NaN Mark Reg {'height': 130, 'weight': 60}
2 2.0 Faye Raker {'height': 130, 'weight': 60}
pd.json_normalize(json_norm_dict, max_level=1)
id name fitness.height fitness.weight
0 1.0 Cole Volk 130 60
1 NaN Mark Reg 130 60
2 2.0 Faye Raker 130 60
json_norm_dict = [
     {
         "state": "Florida",
         "shortname": "FL",
         "info": {"governor": "Rick Scott"},
         "counties": [
             {"name": "Dade", "population": 12345},
             {"name": "Broward", "population": 40000},
             {"name": "Palm Beach", "population": 60000},
         ],
     },
     {
         "state": "Ohio",
         "shortname": "OH",
         "info": {"governor": "John Kasich"},
         "counties": [
             {"name": "Summit", "population": 1234},
             {"name": "Cuyahoga", "population": 1337},
         ],
     },
 ]

pd.json_normalize(json_norm_dict, 
                  record_path="counties",
                  meta=["state", "shortname", ["info", "governor"]])
name population state shortname info.governor
0 Dade 12345 Florida FL Rick Scott
1 Broward 40000 Florida FL Rick Scott
2 Palm Beach 60000 Florida FL Rick Scott
3 Summit 1234 Ohio OH John Kasich
4 Cuyahoga 1337 Ohio OH John Kasich

Load JSON via REST API#

# make GET request to REST API and return as Python dict
def run_rest_get(url, params={}, headers={}, print_summ=True):

    if print_summ:
        print("-" * 60)
        print("GET Address: %s\nHeaders: %s\nParameters: %s" % (url, repr(headers), repr(params)))

    response = requests.get(url, params=params, headers=headers)
    status_code = response.status_code

    if status_code == 200:
        json_dict = response.json()
    else:
        json_dict = {}

    if print_summ:
        print("Status Code: %d" % response.status_code)
        if isinstance(json_dict, dict):
            print("Response Keys: %s\n" % json_dict.keys())

    return json_dict, status_code

Read SQL Query w/ Params#

def read_sql(f_path, params={}):
    """Reads a local text file (.sql) and inserts parameters if passed."""
    with open(f_path) as f:
        query = f.read()
    
    if params:
        query = query.format(**params)
    
    return query

I/O tests#

# read JSON
# https://github.com/wmsayer/wmsayer.github.io/blob/master/docs/_bookfiles/data.json
test_json_path = "C:/Users/wsaye/PycharmProjects/wmsayer.github.io/docs/_bookfiles/data.json"
print(json_load(test_json_path))
{'api_key': 'fdghdfghfgfg'}
# read SQL (no params)
# https://github.com/wmsayer/wmsayer.github.io/blob/master/docs/_bookfiles/query.sql
test_sql_path = "C:/Users/wsaye/PycharmProjects/wmsayer.github.io/docs/_bookfiles/query.sql"
print(read_sql(test_sql_path))
SELECT *
FROM my_table
WHERE id = 1
# read SQL (w/ params)
# https://github.com/wmsayer/wmsayer.github.io/blob/master/docs/_bookfiles/query_w_params.sql
test_sql_params_path = "C:/Users/wsaye/PycharmProjects/wmsayer.github.io/docs/_bookfiles/query_w_params.sql"
test_params = {"my_id": 102393, "max_date": "2000/01/01"}
print(read_sql(test_sql_params_path, params=test_params))
SELECT *
FROM my_table
WHERE id = 102393 OR date = '2000/01/01'

Sample REST Dataset (CoinMetrics)#

def get_asset_metrics(assets, metrics, freq, page_size=10000, print_summ=True):
    # freq options 1b, 1s, 1m, 1h, 1d
    # for 'start_time' and 'end_time', formats "2006-01-20T00:00:00Z" and "2006-01-20" are supported among others
    # https://docs.coinmetrics.io/api/v4#operation/getTimeseriesAssetMetrics
    # https://docs.coinmetrics.io/info/metrics

    assets_str = ", ".join(assets)
    metrics_str = ", ".join(metrics)
    
    api_root = 'https://community-api.coinmetrics.io/v4'
    data_key = "data"

    url = "/".join([api_root, "timeseries/asset-metrics"])
    params = {'assets': assets_str, 'metrics': metrics_str, 'frequency': freq, 'page_size': page_size}
    result_dict, status_code = run_rest_get(url, params=params, headers={}, print_summ=print_summ)

    result_df = pd.DataFrame(result_dict[data_key])
    result_df.sort_values(by=["asset", "time"], inplace=True)
    result_df.reset_index(inplace=True, drop=True)

    for m in metrics:
        result_df[m] = result_df[m].astype(float)

    return result_df
# basic method for pulling data from an API and storing in a local cache
# such that when you're debugging you only have to call the API once
def load_asset_metric_data(cache_path, pull_new=True):
    if pull_new:
        df = get_asset_metrics(test_assets, test_metrics, test_freq, print_summ=False)
        df.to_csv(cache_path, index=False)
    else:
        df = pd.read_csv(cache_path)
    return df

Get data w/ cache#

test_assets = ['btc', 'eth']
test_metrics = ['AdrActCnt', 'PriceUSD']
test_freq = '1d'
test_df_cache = "C:/Users/wsaye/PycharmProjects/CashAppInterview/data/cm_test_data.csv"
test_df = load_asset_metric_data(test_df_cache, pull_new=True)
test_df = test_df.dropna(subset=test_metrics).reset_index(drop=True)

test_df["datetime"] = pd.to_datetime(test_df["time"], utc=True) # str timestamp to datetime
test_df["dayname"] = test_df["datetime"].dt.day_name()
test_df["date"] = pd.to_datetime(test_df["time"], utc=True).dt.date # datetime to date
test_df
asset time AdrActCnt PriceUSD datetime dayname date
0 btc 2010-07-18T00:00:00.000000000Z 860.0 0.085840 2010-07-18 00:00:00+00:00 Sunday 2010-07-18
1 btc 2010-07-19T00:00:00.000000000Z 929.0 0.080800 2010-07-19 00:00:00+00:00 Monday 2010-07-19
2 btc 2010-07-20T00:00:00.000000000Z 936.0 0.074736 2010-07-20 00:00:00+00:00 Tuesday 2010-07-20
3 btc 2010-07-21T00:00:00.000000000Z 784.0 0.079193 2010-07-21 00:00:00+00:00 Wednesday 2010-07-21
4 btc 2010-07-22T00:00:00.000000000Z 594.0 0.058470 2010-07-22 00:00:00+00:00 Thursday 2010-07-22
... ... ... ... ... ... ... ...
7324 eth 2023-02-03T00:00:00.000000000Z 457502.0 1665.605250 2023-02-03 00:00:00+00:00 Friday 2023-02-03
7325 eth 2023-02-04T00:00:00.000000000Z 500596.0 1671.046952 2023-02-04 00:00:00+00:00 Saturday 2023-02-04
7326 eth 2023-02-05T00:00:00.000000000Z 537666.0 1631.121142 2023-02-05 00:00:00+00:00 Sunday 2023-02-05
7327 eth 2023-02-06T00:00:00.000000000Z 503556.0 1614.274822 2023-02-06 00:00:00+00:00 Monday 2023-02-06
7328 eth 2023-02-07T00:00:00.000000000Z 477572.0 1672.722311 2023-02-07 00:00:00+00:00 Tuesday 2023-02-07

7329 rows × 7 columns

Pandas Options & Settings#

Docs and available options here

pd.get_option("display.max_rows")
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)

Summarize Dataframe#

General Info#

test_df.describe()  # summary stats of columns
AdrActCnt PriceUSD
count 7.329000e+03 7329.000000
mean 4.345369e+05 5753.152983
std 3.559563e+05 12098.872294
min 4.080000e+02 0.050541
25% 1.013370e+05 121.930348
50% 4.190120e+05 496.317744
75% 6.759150e+05 4595.020018
max 7.157228e+06 67541.755508
test_df.info()  # dataframe schema info, column types
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7329 entries, 0 to 7328
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   asset      7329 non-null   object             
 1   time       7329 non-null   object             
 2   AdrActCnt  7329 non-null   float64            
 3   PriceUSD   7329 non-null   float64            
 4   datetime   7329 non-null   datetime64[ns, UTC]
 5   dayname    7329 non-null   object             
 6   date       7329 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(2), object(4)
memory usage: 400.9+ KB
test_df.dtypes
asset                     object
time                      object
AdrActCnt                float64
PriceUSD                 float64
datetime     datetime64[ns, UTC]
dayname                   object
date                      object
dtype: object
test_df.head()
asset time AdrActCnt PriceUSD datetime dayname date
0 btc 2010-07-18T00:00:00.000000000Z 860.0 0.085840 2010-07-18 00:00:00+00:00 Sunday 2010-07-18
1 btc 2010-07-19T00:00:00.000000000Z 929.0 0.080800 2010-07-19 00:00:00+00:00 Monday 2010-07-19
2 btc 2010-07-20T00:00:00.000000000Z 936.0 0.074736 2010-07-20 00:00:00+00:00 Tuesday 2010-07-20
3 btc 2010-07-21T00:00:00.000000000Z 784.0 0.079193 2010-07-21 00:00:00+00:00 Wednesday 2010-07-21
4 btc 2010-07-22T00:00:00.000000000Z 594.0 0.058470 2010-07-22 00:00:00+00:00 Thursday 2010-07-22
print(type(test_df.loc[0, "time"]))  # type of particular entry
<class 'str'>
test_df.nlargest(5, "PriceUSD")
# test_df.nsmallest(5, "PriceUSD")
asset time AdrActCnt PriceUSD datetime dayname date
4131 btc 2021-11-08T00:00:00.000000000Z 1018796.0 67541.755508 2021-11-08 00:00:00+00:00 Monday 2021-11-08
4132 btc 2021-11-09T00:00:00.000000000Z 1195638.0 67095.585671 2021-11-09 00:00:00+00:00 Tuesday 2021-11-09
4112 btc 2021-10-20T00:00:00.000000000Z 977215.0 66061.796564 2021-10-20 00:00:00+00:00 Wednesday 2021-10-20
4137 btc 2021-11-14T00:00:00.000000000Z 806451.0 65032.225655 2021-11-14 00:00:00+00:00 Sunday 2021-11-14
4134 btc 2021-11-11T00:00:00.000000000Z 1037951.0 64962.931294 2021-11-11 00:00:00+00:00 Thursday 2021-11-11
test_df["asset"].unique()
# test_df["asset"].nunique()
array(['btc', 'eth'], dtype=object)

Crosstab#

cross_df = test_df.loc[test_df["asset"]== "btc", ["datetime", "dayname", "PriceUSD"]].copy().dropna()
cross_df = cross_df.sort_values(by="datetime")
cross_df["7d_SMA"] = cross_df["PriceUSD"].rolling(7).mean()
cross_df["beating_SMA"] = cross_df["PriceUSD"] > cross_df["7d_SMA"]
cross_df["return"] = cross_df["PriceUSD"].pct_change()
cross_df.dropna(inplace=True)
cross_df
datetime dayname PriceUSD 7d_SMA beating_SMA return
6 2010-07-24 00:00:00+00:00 Saturday 0.054540 0.070596 False -0.099894
7 2010-07-25 00:00:00+00:00 Sunday 0.050541 0.065553 False -0.073329
8 2010-07-26 00:00:00+00:00 Monday 0.056000 0.062010 False 0.108020
9 2010-07-27 00:00:00+00:00 Tuesday 0.058622 0.059708 False 0.046822
10 2010-07-28 00:00:00+00:00 Wednesday 0.058911 0.056811 True 0.004931
... ... ... ... ... ... ...
4583 2023-02-03 00:00:00+00:00 Friday 23445.813177 23341.700805 True -0.002544
4584 2023-02-04 00:00:00+00:00 Saturday 23355.524386 23391.231508 False -0.003851
4585 2023-02-05 00:00:00+00:00 Sunday 22957.225191 23274.407054 False -0.017054
4586 2023-02-06 00:00:00+00:00 Monday 22745.165037 23266.655308 False -0.009237
4587 2023-02-07 00:00:00+00:00 Tuesday 23266.269204 23286.114921 False 0.022911

4582 rows × 6 columns

pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'])
dayname Friday Monday Saturday Sunday Thursday Tuesday Wednesday
beating_SMA
False 271 288 295 296 278 300 284
True 383 367 360 359 376 355 370
pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'], normalize=True)
dayname Friday Monday Saturday Sunday Thursday Tuesday Wednesday
beating_SMA
False 0.059144 0.062855 0.064382 0.064601 0.060672 0.065474 0.061982
True 0.083588 0.080096 0.078568 0.078350 0.082060 0.077477 0.080751
pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'], values=cross_df['return'], aggfunc="mean")
dayname Friday Monday Saturday Sunday Thursday Tuesday Wednesday
beating_SMA
False -0.018078 -0.020847 -0.012298 -0.009072 -0.020434 -0.017126 -0.017303
True 0.019876 0.025667 0.017170 0.008909 0.021925 0.024923 0.022291

Sort/Rank#

sort_df = test_df[["date", "asset", "PriceUSD"]].copy()
sort_df['price_rank'] = sort_df["PriceUSD"].rank(ascending=True, pct=False)
sort_df['price_pct'] = sort_df["PriceUSD"].rank(ascending=True, pct=True)
sort_df
date asset PriceUSD price_rank price_pct
0 2010-07-18 btc 0.085840 83.0 0.011325
1 2010-07-19 btc 0.080800 81.0 0.011052
2 2010-07-20 btc 0.074736 79.0 0.010779
3 2010-07-21 btc 0.079193 80.0 0.010916
4 2010-07-22 btc 0.058470 5.0 0.000682
... ... ... ... ... ...
7324 2023-02-03 eth 1665.605250 4727.0 0.644972
7325 2023-02-04 eth 1671.046952 4728.0 0.645108
7326 2023-02-05 eth 1631.121142 4710.0 0.642652
7327 2023-02-06 eth 1614.274822 4699.0 0.641152
7328 2023-02-07 eth 1672.722311 4730.0 0.645381

7329 rows × 5 columns

sort_df.sort_values(by="price_rank", ascending=False)
date asset PriceUSD price_rank price_pct
4131 2021-11-08 btc 67541.755508 7329.0 1.000000
4132 2021-11-09 btc 67095.585671 7328.0 0.999864
4112 2021-10-20 btc 66061.796564 7327.0 0.999727
4137 2021-11-14 btc 65032.225655 7326.0 0.999591
4134 2021-11-11 btc 64962.931294 7325.0 0.999454
... ... ... ... ... ...
4 2010-07-22 btc 0.058470 5.0 0.000682
17 2010-08-04 btc 0.057016 4.0 0.000546
8 2010-07-26 btc 0.056000 3.0 0.000409
6 2010-07-24 btc 0.054540 2.0 0.000273
7 2010-07-25 btc 0.050541 1.0 0.000136

7329 rows × 5 columns
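rank() also works within groups, e.g. ranking each asset's prices separately:

sort_df["price_rank_by_asset"] = sort_df.groupby("asset")["PriceUSD"].rank(ascending=True)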

Cleaning#

Deleting Rows/Columns here

Replace#

replace_df = test_df[["date", "asset", "dayname"]].copy()
# replace_df.replace("Sunday", "Sun")
replace_df.replace({"Sunday": "S",  "Monday": "M", "Tuesday": "T"})
date asset dayname
0 2010-07-18 btc S
1 2010-07-19 btc M
2 2010-07-20 btc T
3 2010-07-21 btc Wednesday
4 2010-07-22 btc Thursday
... ... ... ...
7324 2023-02-03 eth Friday
7325 2023-02-04 eth Saturday
7326 2023-02-05 eth S
7327 2023-02-06 eth M
7328 2023-02-07 eth T

7329 rows × 3 columns

Drop/Fill NA#

cleaning_df = test_df[["date", "asset", "PriceUSD"]].pivot(index="date", columns="asset", values="PriceUSD")
cleaning_df
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

4588 rows × 2 columns

# cleaning_df.dropna()  # drops rows with N/A in any column
cleaning_df.dropna(subset=["eth"])  # drops N/A in subset only
asset btc eth
date
2015-08-08 261.450276 1.199990
2015-08-09 266.342020 1.199990
2015-08-10 264.928825 1.199990
2015-08-11 271.421736 0.990000
2015-08-12 268.143868 1.288000
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

2741 rows × 2 columns

cleaning_df.fillna(-1)
asset btc eth
date
2010-07-18 0.085840 -1.000000
2010-07-19 0.080800 -1.000000
2010-07-20 0.074736 -1.000000
2010-07-21 0.079193 -1.000000
2010-07-22 0.058470 -1.000000
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

4588 rows × 2 columns

cleaning_df.fillna(method="ffill")
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

4588 rows × 2 columns

cleaning_df.fillna(method="bfill")
asset btc eth
date
2010-07-18 0.085840 1.199990
2010-07-19 0.080800 1.199990
2010-07-20 0.074736 1.199990
2010-07-21 0.079193 1.199990
2010-07-22 0.058470 1.199990
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

4588 rows × 2 columns

# setup df for interpolation
interp_df = cleaning_df.iloc[cleaning_df.shape[0] - 5:, :].copy()
interp_df["btc_og"] = interp_df["btc"]
interp_df["eth_og"] = interp_df["eth"]
interp_df.iloc[1, 0:2] = [np.nan, np.nan]

interp_df.interpolate(method="linear")
asset btc eth btc_og eth_og
date
2023-02-03 23445.813177 1665.605250 23445.813177 1665.605250
2023-02-04 23201.519184 1648.363196 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311 23266.269204 1672.722311

Selecting/Sampling#

test_df.select_dtypes(include='float64')
AdrActCnt PriceUSD
0 860.0 0.085840
1 929.0 0.080800
2 936.0 0.074736
3 784.0 0.079193
4 594.0 0.058470
... ... ...
7324 457502.0 1665.605250
7325 500596.0 1671.046952
7326 537666.0 1631.121142
7327 503556.0 1614.274822
7328 477572.0 1672.722311

7329 rows × 2 columns

# test_df.sample(n = 200)
test_df.sample(frac = 0.25, random_state=42)
asset time AdrActCnt PriceUSD datetime dayname date
2576 btc 2017-08-06T00:00:00.000000000Z 547079.0 3230.116709 2017-08-06 00:00:00+00:00 Sunday 2017-08-06
1565 btc 2014-10-30T00:00:00.000000000Z 205736.0 344.609382 2014-10-30 00:00:00+00:00 Thursday 2014-10-30
865 btc 2012-11-29T00:00:00.000000000Z 41205.0 12.497662 2012-11-29 00:00:00+00:00 Thursday 2012-11-29
4461 btc 2022-10-04T00:00:00.000000000Z 932869.0 20339.971281 2022-10-04 00:00:00+00:00 Tuesday 2022-10-04
3728 btc 2020-10-01T00:00:00.000000000Z 922880.0 10606.617333 2020-10-01 00:00:00+00:00 Thursday 2020-10-01
... ... ... ... ... ... ... ...
367 btc 2011-07-20T00:00:00.000000000Z 25461.0 13.678331 2011-07-20 00:00:00+00:00 Wednesday 2011-07-20
3520 btc 2020-03-07T00:00:00.000000000Z 682432.0 8898.306957 2020-03-07 00:00:00+00:00 Saturday 2020-03-07
5824 eth 2018-12-26T00:00:00.000000000Z 288376.0 130.309451 2018-12-26 00:00:00+00:00 Wednesday 2018-12-26
4362 btc 2022-06-27T00:00:00.000000000Z 953524.0 20751.051696 2022-06-27 00:00:00+00:00 Monday 2022-06-27
4280 btc 2022-04-06T00:00:00.000000000Z 919776.0 43304.937199 2022-04-06 00:00:00+00:00 Wednesday 2022-04-06

1832 rows × 7 columns

Boolean Selection#

bool_df = test_df[["date", "asset", "PriceUSD"]].pivot(index="date", columns="asset", values="PriceUSD")
bool_df
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

4588 rows × 2 columns

# returns Series of same shape w/ np.nan at failing rows (default)
# bool_df['eth'].where(bool_df['eth'] > 10**3)  # returns np.nan in failing rows
bool_df['eth'].where(bool_df['eth'] > 10**3, 0)  # returns 0 in failing rows
date
2010-07-18       0.000000
2010-07-19       0.000000
2010-07-20       0.000000
2010-07-21       0.000000
2010-07-22       0.000000
                 ...     
2023-02-03    1665.605250
2023-02-04    1671.046952
2023-02-05    1631.121142
2023-02-06    1614.274822
2023-02-07    1672.722311
Name: eth, Length: 4588, dtype: float64
test_df["asset"].isin(["btc"])
0        True
1        True
2        True
3        True
4        True
        ...  
7324    False
7325    False
7326    False
7327    False
7328    False
Name: asset, Length: 7329, dtype: bool
na_check_df = bool_df.isna()
na_check_series = na_check_df.any(axis=1)  # aggregate booleans
bool_df.loc[na_check_series, :]
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2015-08-03 282.185052 NaN
2015-08-04 285.286617 NaN
2015-08-05 282.338887 NaN
2015-08-06 278.995749 NaN
2015-08-07 279.488715 NaN

1847 rows × 2 columns

bool_df.loc[(bool_df['eth'] > 10**3), :]
asset btc eth
date
2018-01-06 17103.589280 1006.157475
2018-01-07 16231.694999 1102.889537
2018-01-08 14937.415089 1134.699060
2018-01-09 14378.586217 1288.406875
2018-01-10 14669.088266 1231.767295
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

789 rows × 2 columns

Boolean Operators#

bool_df.loc[~(bool_df['eth'] > 10**3), :]  # NOT
asset btc eth
date
2010-07-18 0.085840 NaN
2010-07-19 0.080800 NaN
2010-07-20 0.074736 NaN
2010-07-21 0.079193 NaN
2010-07-22 0.058470 NaN
... ... ...
2020-12-31 29022.671413 739.025850
2021-01-01 29380.693733 730.914321
2021-01-02 32022.681058 775.296622
2021-01-03 33277.835305 990.365325
2022-06-18 19013.867254 992.790097

3799 rows × 2 columns

bool_df.loc[(bool_df['eth'] > 10**3) & (bool_df['eth'] > 10), :]  # AND
asset btc eth
date
2018-01-06 17103.589280 1006.157475
2018-01-07 16231.694999 1102.889537
2018-01-08 14937.415089 1134.699060
2018-01-09 14378.586217 1288.406875
2018-01-10 14669.088266 1231.767295
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

789 rows × 2 columns

bool_df.loc[(bool_df['eth'] > 10**3) | (bool_df['eth'] > 10), :]  # OR
asset btc eth
date
2016-03-05 397.314454 10.751006
2016-03-06 403.705187 10.984407
2016-03-09 412.540704 11.817099
2016-03-10 416.340929 11.165651
2016-03-11 419.511934 11.122905
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

2438 rows × 2 columns
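The same filters can be written as an expression string with DataFrame.query():

bool_df.query("eth > 10**3 and btc > 10**4")  # equivalent to chained boolean masks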

Datetime#

Python datetime <-> string formatting here

Datetime <-> String#

Python:

test_date_str = "2022-01-01"
print(dt.datetime.strptime(test_date_str, "%Y-%m-%d"))
print(dt.datetime.strptime(test_date_str, "%Y-%m-%d").date())
2022-01-01 00:00:00
2022-01-01

Pandas:

test_df["datetime"] = pd.to_datetime(test_df["time"], utc=True) # str timestamp to datetime
test_df['datetime_alt'] = pd.to_datetime(test_df["time"], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
test_df["date"] = pd.to_datetime(test_df["time"], utc=True).dt.date # datetime to date
test_df['datetime_str'] = test_df["datetime"].dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')  # datetime to str
test_df['date_str'] = pd.to_datetime(test_df["date"]).dt.strftime('%Y-%m-%d')  # date to str

test_df[["datetime", "datetime_alt", "date", "datetime_str", "date_str"]]
datetime datetime_alt date datetime_str date_str
0 2010-07-18 00:00:00+00:00 2010-07-18 00:00:00+00:00 2010-07-18 2010-07-18T00:00:00.000000Z 2010-07-18
1 2010-07-19 00:00:00+00:00 2010-07-19 00:00:00+00:00 2010-07-19 2010-07-19T00:00:00.000000Z 2010-07-19
2 2010-07-20 00:00:00+00:00 2010-07-20 00:00:00+00:00 2010-07-20 2010-07-20T00:00:00.000000Z 2010-07-20
3 2010-07-21 00:00:00+00:00 2010-07-21 00:00:00+00:00 2010-07-21 2010-07-21T00:00:00.000000Z 2010-07-21
4 2010-07-22 00:00:00+00:00 2010-07-22 00:00:00+00:00 2010-07-22 2010-07-22T00:00:00.000000Z 2010-07-22
... ... ... ... ... ...
7324 2023-02-03 00:00:00+00:00 2023-02-03 00:00:00+00:00 2023-02-03 2023-02-03T00:00:00.000000Z 2023-02-03
7325 2023-02-04 00:00:00+00:00 2023-02-04 00:00:00+00:00 2023-02-04 2023-02-04T00:00:00.000000Z 2023-02-04
7326 2023-02-05 00:00:00+00:00 2023-02-05 00:00:00+00:00 2023-02-05 2023-02-05T00:00:00.000000Z 2023-02-05
7327 2023-02-06 00:00:00+00:00 2023-02-06 00:00:00+00:00 2023-02-06 2023-02-06T00:00:00.000000Z 2023-02-06
7328 2023-02-07 00:00:00+00:00 2023-02-07 00:00:00+00:00 2023-02-07 2023-02-07T00:00:00.000000Z 2023-02-07

7329 rows × 5 columns
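Two related conversions: unix epoch values parse via unit=, and tz-aware columns can be shifted with tz_convert():

pd.to_datetime(1666000000, unit="s", utc=True)   # epoch seconds -> UTC Timestamp
test_df["datetime"].dt.tz_convert("US/Eastern")  # shift tz-aware column to another zone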

Datetime Altering#

Python:

test_date_str = "2022-01-01"
test_dt = dt.datetime.strptime(test_date_str, "%Y-%m-%d")
print(test_dt + dt.timedelta(days=2))
print(test_dt + relativedelta(years=5, months=4))
2022-01-03 00:00:00
2027-05-01 00:00:00

Pandas:

# (also pd.Timedelta)
test_df["date_offset"] = test_df["date"] + pd.DateOffset(years=2, months=2, days=1)

test_df[["date", "date_offset"]]
date date_offset
0 2010-07-18 2012-09-19
1 2010-07-19 2012-09-20
2 2010-07-20 2012-09-21
3 2010-07-21 2012-09-22
4 2010-07-22 2012-09-23
... ... ...
7324 2023-02-03 2025-04-04
7325 2023-02-04 2025-04-05
7326 2023-02-05 2025-04-06
7327 2023-02-06 2025-04-07
7328 2023-02-07 2025-04-08

7329 rows × 2 columns

Date Parts (Week & Weekday)#

Python:

test_date_str = "2022-01-01"
test_dt = dt.datetime.strptime(test_date_str, "%Y-%m-%d")
print(test_dt.isocalendar().week)
print(test_dt.weekday())
print(test_dt.strftime('%A'))  # get day name
52
5
Saturday

Pandas:

test_df["week"] = test_df["datetime"].dt.isocalendar().week
test_df["weekday"] = test_df["datetime"].dt.weekday
test_df["dayname"] = test_df["datetime"].dt.day_name()
test_df["monthname"] = test_df["datetime"].dt.month_name()
test_df["year"] = test_df["datetime"].dt.year

test_df[["date", "week", "weekday", "dayname", "monthname", "year"]]
date week weekday dayname monthname year
0 2010-07-18 28 6 Sunday July 2010
1 2010-07-19 29 0 Monday July 2010
2 2010-07-20 29 1 Tuesday July 2010
3 2010-07-21 29 2 Wednesday July 2010
4 2010-07-22 29 3 Thursday July 2010
... ... ... ... ... ... ...
7324 2023-02-03 5 4 Friday February 2023
7325 2023-02-04 5 5 Saturday February 2023
7326 2023-02-05 5 6 Sunday February 2023
7327 2023-02-06 6 0 Monday February 2023
7328 2023-02-07 6 1 Tuesday February 2023

7329 rows × 6 columns

Numerical#

num_df = test_df.loc[3000:, ["date", "asset", "PriceUSD"]].copy()
num_df["PriceUSD_rnd"] = num_df["PriceUSD"].round()
num_df["PriceUSD_rnd1"] = num_df["PriceUSD"].round(1)
num_df["PriceUSD_floor"] = np.floor(num_df["PriceUSD"])
num_df["PriceUSD_ceil"] = np.ceil(num_df["PriceUSD"])
num_df
date asset PriceUSD PriceUSD_rnd PriceUSD_rnd1 PriceUSD_floor PriceUSD_ceil
3000 2018-10-04 btc 6545.285668 6545.0 6545.3 6545.0 6546.0
3001 2018-10-05 btc 6585.149580 6585.0 6585.1 6585.0 6586.0
3002 2018-10-06 btc 6550.478316 6550.0 6550.5 6550.0 6551.0
3003 2018-10-07 btc 6564.568260 6565.0 6564.6 6564.0 6565.0
3004 2018-10-08 btc 6604.685274 6605.0 6604.7 6604.0 6605.0
... ... ... ... ... ... ... ...
7324 2023-02-03 eth 1665.605250 1666.0 1665.6 1665.0 1666.0
7325 2023-02-04 eth 1671.046952 1671.0 1671.0 1671.0 1672.0
7326 2023-02-05 eth 1631.121142 1631.0 1631.1 1631.0 1632.0
7327 2023-02-06 eth 1614.274822 1614.0 1614.3 1614.0 1615.0
7328 2023-02-07 eth 1672.722311 1673.0 1672.7 1672.0 1673.0

4329 rows × 7 columns

Transforms#

Indexes#

  • df.set_index(keys, drop=True, verify_integrity=False)

  • df.reset_index(drop=False)

  • df.reindex()
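A minimal sketch chaining the three calls on a toy frame (idx_df is hypothetical):

idx_df = pd.DataFrame({"key": ["a", "b", "c"], "val": [1, 2, 3]})
idx_df = idx_df.set_index("key")                # "key" column becomes the index
idx_df = idx_df.reindex(["a", "b", "c", "d"])   # conform to a new index; "d" gets NaN
idx_df = idx_df.reset_index()                   # move the index back to a column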

Pivot & Melt#

  • df.unstack() - pivot a level of a MultiIndex to columns (sketched below)
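A quick stack()/unstack() round-trip on a toy frame (toy is hypothetical):

toy = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
stacked = toy.stack()   # Series with a (row, column) MultiIndex
stacked.unstack()       # pivots the inner level back to columns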

Pivot to MultiIndex#

pivot_df = test_df.pivot(index="date", columns="asset", values=['AdrActCnt', 'PriceUSD'])
pivot_df.reset_index(drop=False, inplace=True)
pivot_df
date AdrActCnt PriceUSD
asset btc eth btc eth
0 2010-07-18 860.0 NaN 0.085840 NaN
1 2010-07-19 929.0 NaN 0.080800 NaN
2 2010-07-20 936.0 NaN 0.074736 NaN
3 2010-07-21 784.0 NaN 0.079193 NaN
4 2010-07-22 594.0 NaN 0.058470 NaN
... ... ... ... ... ...
4583 2023-02-03 988867.0 457502.0 23445.813177 1665.605250
4584 2023-02-04 961115.0 500596.0 23355.524386 1671.046952
4585 2023-02-05 802171.0 537666.0 22957.225191 1631.121142
4586 2023-02-06 920525.0 503556.0 22745.165037 1614.274822
4587 2023-02-07 944049.0 477572.0 23266.269204 1672.722311

4588 rows × 5 columns

Melt from MultiIndex#

# pivot_df = pd.melt(pivot_df, col_level=0, id_vars=["date"])
pivot_df = pd.melt(pivot_df, id_vars=[("date", "")])
pivot_df.columns = ["date", "metric", "asset", "value"]
pivot_df
date metric asset value
0 2010-07-18 AdrActCnt btc 860.000000
1 2010-07-19 AdrActCnt btc 929.000000
2 2010-07-20 AdrActCnt btc 936.000000
3 2010-07-21 AdrActCnt btc 784.000000
4 2010-07-22 AdrActCnt btc 594.000000
... ... ... ... ...
18347 2023-02-03 PriceUSD eth 1665.605250
18348 2023-02-04 PriceUSD eth 1671.046952
18349 2023-02-05 PriceUSD eth 1631.121142
18350 2023-02-06 PriceUSD eth 1614.274822
18351 2023-02-07 PriceUSD eth 1672.722311

18352 rows × 4 columns

Pivot back to OG (single Index)#

pivot_df = pivot_df.pivot(index=["date", "asset"], columns="metric", values="value")
pivot_df.columns = pivot_df.columns.rename("")
pivot_df.reset_index(drop=False, inplace=True)
pivot_df
date asset AdrActCnt PriceUSD
0 2010-07-18 btc 860.0 0.085840
1 2010-07-18 eth NaN NaN
2 2010-07-19 btc 929.0 0.080800
3 2010-07-19 eth NaN NaN
4 2010-07-20 btc 936.0 0.074736
... ... ... ... ...
9171 2023-02-05 eth 537666.0 1631.121142
9172 2023-02-06 btc 920525.0 22745.165037
9173 2023-02-06 eth 503556.0 1614.274822
9174 2023-02-07 btc 944049.0 23266.269204
9175 2023-02-07 eth 477572.0 1672.722311

9176 rows × 4 columns

Pivot to Date Index (Fill missing dates)#

date_rng = pd.date_range(test_df["date"].min(), test_df["date"].max())
pivot_fill_df = test_df.pivot(index="date", columns="asset", values='PriceUSD')
# pivot_fill_df = pivot_fill_df.drop(labels=[1, 3, 4524], axis=0)  # drop by index value
pivot_fill_df = pivot_fill_df.drop(pivot_fill_df.index[[1, 3, 4524]])  # drop by row num
pivot_fill_df = pivot_fill_df.reindex(date_rng, fill_value=np.nan)
pivot_fill_df
asset btc eth
2010-07-18 0.085840 NaN
2010-07-19 NaN NaN
2010-07-20 0.074736 NaN
2010-07-21 NaN NaN
2010-07-22 0.058470 NaN
... ... ...
2023-02-03 23445.813177 1665.605250
2023-02-04 23355.524386 1671.046952
2023-02-05 22957.225191 1631.121142
2023-02-06 22745.165037 1614.274822
2023-02-07 23266.269204 1672.722311

4588 rows × 2 columns

Join & Merge#

Merge:

  • can join on indices or columns

  • validate - check 1:1, 1:many, etc. (also available for join)

  • indicator - produces additional column to indicate “left_only”, “right_only”, or “both”

join_merge_df = test_df.loc[3000:, ["date", "datetime", "asset", 'AdrActCnt', 'PriceUSD']].copy()
join_merge_df = join_merge_df.sort_values(by="date").reset_index(drop=True)
join_merge_df
date datetime asset AdrActCnt PriceUSD
0 2015-08-08 2015-08-08 00:00:00+00:00 eth 1208.0 1.199990
1 2015-08-09 2015-08-09 00:00:00+00:00 eth 1113.0 1.199990
2 2015-08-10 2015-08-10 00:00:00+00:00 eth 1430.0 1.199990
3 2015-08-11 2015-08-11 00:00:00+00:00 eth 2697.0 0.990000
4 2015-08-12 2015-08-12 00:00:00+00:00 eth 1219.0 1.288000
... ... ... ... ... ...
4324 2023-02-05 2023-02-05 00:00:00+00:00 eth 537666.0 1631.121142
4325 2023-02-06 2023-02-06 00:00:00+00:00 eth 503556.0 1614.274822
4326 2023-02-06 2023-02-06 00:00:00+00:00 btc 920525.0 22745.165037
4327 2023-02-07 2023-02-07 00:00:00+00:00 btc 944049.0 23266.269204
4328 2023-02-07 2023-02-07 00:00:00+00:00 eth 477572.0 1672.722311

4329 rows × 5 columns

join_cols = ["date", "asset"]
join_merge_df1 = join_merge_df.loc[:3000, join_cols + ["AdrActCnt"]]
join_merge_df2 = join_merge_df.loc[:, join_cols + ["PriceUSD"]]
print(join_merge_df1)
print(join_merge_df2)
            date asset  AdrActCnt
0     2015-08-08   eth     1208.0
1     2015-08-09   eth     1113.0
2     2015-08-10   eth     1430.0
3     2015-08-11   eth     2697.0
4     2015-08-12   eth     1219.0
...          ...   ...        ...
2996  2021-04-12   eth   649920.0
2997  2021-04-13   btc  1178027.0
2998  2021-04-13   eth   641476.0
2999  2021-04-14   eth   662089.0
3000  2021-04-14   btc  1149773.0

[3001 rows x 3 columns]
            date asset      PriceUSD
0     2015-08-08   eth      1.199990
1     2015-08-09   eth      1.199990
2     2015-08-10   eth      1.199990
3     2015-08-11   eth      0.990000
4     2015-08-12   eth      1.288000
...          ...   ...           ...
4324  2023-02-05   eth   1631.121142
4325  2023-02-06   eth   1614.274822
4326  2023-02-06   btc  22745.165037
4327  2023-02-07   btc  23266.269204
4328  2023-02-07   eth   1672.722311

[4329 rows x 3 columns]
joined_df = join_merge_df1.join(join_merge_df2.set_index(keys=join_cols), how="outer", on=join_cols)
joined_df
date asset AdrActCnt PriceUSD
0 2015-08-08 eth 1208.0 1.199990
1 2015-08-09 eth 1113.0 1.199990
2 2015-08-10 eth 1430.0 1.199990
3 2015-08-11 eth 2697.0 0.990000
4 2015-08-12 eth 1219.0 1.288000
... ... ... ... ...
3000 2023-02-05 eth NaN 1631.121142
3000 2023-02-06 eth NaN 1614.274822
3000 2023-02-06 btc NaN 22745.165037
3000 2023-02-07 btc NaN 23266.269204
3000 2023-02-07 eth NaN 1672.722311

4329 rows × 4 columns

merged_df = join_merge_df1.merge(join_merge_df2, how="outer", on=join_cols, indicator=True)
merged_df
date asset AdrActCnt PriceUSD _merge
0 2015-08-08 eth 1208.0 1.199990 both
1 2015-08-09 eth 1113.0 1.199990 both
2 2015-08-10 eth 1430.0 1.199990 both
3 2015-08-11 eth 2697.0 0.990000 both
4 2015-08-12 eth 1219.0 1.288000 both
... ... ... ... ... ...
4324 2023-02-05 eth NaN 1631.121142 right_only
4325 2023-02-06 eth NaN 1614.274822 right_only
4326 2023-02-06 btc NaN 22745.165037 right_only
4327 2023-02-07 btc NaN 23266.269204 right_only
4328 2023-02-07 eth NaN 1672.722311 right_only

4329 rows × 5 columns
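The validate option from the bullets above bolts onto the same call; it should pass here since (date, asset) is unique on both sides, and raises MergeError otherwise:

join_merge_df1.merge(join_merge_df2, how="outer", on=join_cols, validate="one_to_one")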

Explode#

explode_df = pd.DataFrame({"city": ['A', 'B', 'C'],
                   "day1": [22, 25, 21],
                   'day2':[31, 12, 67],
                   'day3': [27, 20, 15],
                   'day4': [34, 37, [41, 45, 67, 90, 21]],
                   'day5': [23, 54, 36]})
explode_df
city day1 day2 day3 day4 day5
0 A 22 31 27 34 23
1 B 25 12 20 37 54
2 C 21 67 15 [41, 45, 67, 90, 21] 36
explode_df.explode("day4", ignore_index=False)
city day1 day2 day3 day4 day5
0 A 22 31 27 34 23
1 B 25 12 20 37 54
2 C 21 67 15 41 36
2 C 21 67 15 45 36
2 C 21 67 15 67 36
2 C 21 67 15 90 36
2 C 21 67 15 21 36
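Pass ignore_index=True to get a fresh RangeIndex instead of the repeated index 2:

explode_df.explode("day4", ignore_index=True)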

Aggregation#

Aggregation functions#

  • mean(): Compute mean of groups

  • sum(): Compute sum of group values

  • size(): Compute group sizes

  • count(): Compute count of group

  • std(): Standard deviation of groups

  • var(): Compute variance of groups

  • sem(): Standard error of the mean of groups

  • first(): Compute first of group values

  • last(): Compute last of group values

  • nth() : Take nth value, or a subset if n is a list

  • min(): Compute min of group values

  • max(): Compute max of group values

agg_df = test_df[test_metrics]
agg_df.count()
AdrActCnt    7329
PriceUSD     7329
dtype: int64
agg_df.nunique()
AdrActCnt    7268
PriceUSD     7278
dtype: int64
agg_df.median()
AdrActCnt    419012.000000
PriceUSD        496.317744
dtype: float64
agg_df.quantile(q=0.10)
AdrActCnt    14817.800000
PriceUSD         5.596396
Name: 0.1, dtype: float64

Groupby#

DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, observed=False, dropna=True)

group_df = test_df[["date", "year", "asset"] + test_metrics]
group_df.groupby(by="asset").nunique()
date year AdrActCnt PriceUSD
asset
btc 4588 14 4562 4565
eth 2741 9 2725 2716
group_df.groupby(by=["year", "asset"]).nunique()
date AdrActCnt PriceUSD
year asset
2010 btc 167 151 154
2011 btc 365 363 356
2012 btc 366 363 366
2013 btc 365 365 365
2014 btc 365 365 365
... ... ... ... ...
2021 eth 365 365 365
2022 btc 365 365 365
eth 365 364 365
2023 btc 38 38 38
eth 38 38 38

23 rows × 3 columns

Groupby Map#

agg_map = {'AdrActCnt':['count', 'nunique'], 
           'PriceUSD':['max', 'min', lambda x: x.max() - x.min()]
          }
group_df.groupby(by=["year", "asset"]).agg(agg_map)
AdrActCnt PriceUSD
count nunique max min <lambda_0>
year asset
2010 btc 167 151 0.400982 0.050541 0.350442
2011 btc 365 363 29.029921 0.295000 28.734921
2012 btc 366 363 13.755252 4.255239 9.500012
2013 btc 365 365 1134.932231 13.280607 1121.651624
2014 btc 365 365 914.459961 310.442004 604.017957
... ... ... ... ... ... ...
2021 eth 365 365 4811.156463 730.914321 4080.242142
2022 btc 365 365 47560.009382 15758.291282 31801.718100
eth 365 364 3832.365610 992.790097 2839.575513
2023 btc 38 38 23774.996368 16606.752043 7168.244325
eth 38 38 1672.722311 1200.005919 472.716392

23 rows × 5 columns
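The same aggregation can be spelled with named aggregation, which yields flat column names instead of a MultiIndex:

group_df.groupby(by=["year", "asset"]).agg(
    n_obs=("AdrActCnt", "count"),
    price_max=("PriceUSD", "max"),
    price_range=("PriceUSD", lambda x: x.max() - x.min()),
)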

Groupby Datetime#

DataFrame.groupby(pd.Grouper(key="datetime", axis=0, freq="M"))

group_dt_df = test_df.pivot(index="datetime", columns="asset", values=test_metrics)
group_dt_df
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-18 00:00:00+00:00 860.0 NaN 0.085840 NaN
2010-07-19 00:00:00+00:00 929.0 NaN 0.080800 NaN
2010-07-20 00:00:00+00:00 936.0 NaN 0.074736 NaN
2010-07-21 00:00:00+00:00 784.0 NaN 0.079193 NaN
2010-07-22 00:00:00+00:00 594.0 NaN 0.058470 NaN
... ... ... ... ...
2023-02-03 00:00:00+00:00 988867.0 457502.0 23445.813177 1665.605250
2023-02-04 00:00:00+00:00 961115.0 500596.0 23355.524386 1671.046952
2023-02-05 00:00:00+00:00 802171.0 537666.0 22957.225191 1631.121142
2023-02-06 00:00:00+00:00 920525.0 503556.0 22745.165037 1614.274822
2023-02-07 00:00:00+00:00 944049.0 477572.0 23266.269204 1672.722311

4588 rows × 4 columns

group_dt_df.groupby(pd.Grouper(axis=0, freq='M')).last()
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-31 00:00:00+00:00 479.0 NaN 0.067546 NaN
2010-08-31 00:00:00+00:00 733.0 NaN 0.060000 NaN
2010-09-30 00:00:00+00:00 640.0 NaN 0.061900 NaN
2010-10-31 00:00:00+00:00 609.0 NaN 0.192500 NaN
2010-11-30 00:00:00+00:00 817.0 NaN 0.208200 NaN
... ... ... ... ...
2022-10-31 00:00:00+00:00 980614.0 476137.0 20490.858181 1571.203871
2022-11-30 00:00:00+00:00 946754.0 525545.0 17176.898691 1298.039420
2022-12-31 00:00:00+00:00 839910.0 519101.0 16524.217802 1195.330879
2023-01-31 00:00:00+00:00 944348.0 519409.0 23130.051913 1584.821083
2023-02-28 00:00:00+00:00 944049.0 477572.0 23266.269204 1672.722311

152 rows × 4 columns

group_dt_df.groupby(pd.Grouper(axis=0, freq='Y')).first()
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-12-31 00:00:00+00:00 860.0 NaN 0.085840 NaN
2011-12-31 00:00:00+00:00 1071.0 NaN 0.300000 NaN
2012-12-31 00:00:00+00:00 11474.0 NaN 5.294843 NaN
2013-12-31 00:00:00+00:00 38733.0 NaN 13.331371 NaN
2014-12-31 00:00:00+00:00 96516.0 NaN 752.404550 NaN
... ... ... ... ...
2019-12-31 00:00:00+00:00 433715.0 227755.0 3808.117832 139.154644
2020-12-31 00:00:00+00:00 524360.0 231794.0 7170.631869 129.963875
2021-12-31 00:00:00+00:00 1001890.0 511250.0 29380.693733 730.914321
2022-12-31 00:00:00+00:00 695722.0 581311.0 47560.009382 3761.059640
2023-12-31 00:00:00+00:00 719716.0 522461.0 16606.752043 1200.005919

14 rows × 4 columns
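When the datetime is already the index, resample() is shorthand for the same Grouper calls:

group_dt_df.resample("M").last()   # same as pd.Grouper(axis=0, freq="M")
group_dt_df.resample("Y").first()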

group_dt_df_2 = test_df[["datetime", "asset", "AdrActCnt"]]
group_dt_df_2
datetime asset AdrActCnt
0 2010-07-18 00:00:00+00:00 btc 860.0
1 2010-07-19 00:00:00+00:00 btc 929.0
2 2010-07-20 00:00:00+00:00 btc 936.0
3 2010-07-21 00:00:00+00:00 btc 784.0
4 2010-07-22 00:00:00+00:00 btc 594.0
... ... ... ...
7324 2023-02-03 00:00:00+00:00 eth 457502.0
7325 2023-02-04 00:00:00+00:00 eth 500596.0
7326 2023-02-05 00:00:00+00:00 eth 537666.0
7327 2023-02-06 00:00:00+00:00 eth 503556.0
7328 2023-02-07 00:00:00+00:00 eth 477572.0

7329 rows × 3 columns

group_dt_df_2.groupby(["asset", pd.Grouper(key="datetime", freq='Y')]).first()
AdrActCnt
asset datetime
btc 2010-12-31 00:00:00+00:00 860.0
2011-12-31 00:00:00+00:00 1071.0
2012-12-31 00:00:00+00:00 11474.0
2013-12-31 00:00:00+00:00 38733.0
2014-12-31 00:00:00+00:00 96516.0
... ... ...
eth 2019-12-31 00:00:00+00:00 227755.0
2020-12-31 00:00:00+00:00 231794.0
2021-12-31 00:00:00+00:00 511250.0
2022-12-31 00:00:00+00:00 581311.0
2023-12-31 00:00:00+00:00 522461.0

23 rows × 1 columns

Rolling/Window#

DataFrame.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None, step=None, method='single')

Shorthand methods (demonstrated after the rolling example below):

  • cumsum()

  • pct_change()

rolling_df = test_df.pivot(index="datetime", columns="asset", values=test_metrics)
rolling_df
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-18 00:00:00+00:00 860.0 NaN 0.085840 NaN
2010-07-19 00:00:00+00:00 929.0 NaN 0.080800 NaN
2010-07-20 00:00:00+00:00 936.0 NaN 0.074736 NaN
2010-07-21 00:00:00+00:00 784.0 NaN 0.079193 NaN
2010-07-22 00:00:00+00:00 594.0 NaN 0.058470 NaN
... ... ... ... ...
2023-02-03 00:00:00+00:00 988867.0 457502.0 23445.813177 1665.605250
2023-02-04 00:00:00+00:00 961115.0 500596.0 23355.524386 1671.046952
2023-02-05 00:00:00+00:00 802171.0 537666.0 22957.225191 1631.121142
2023-02-06 00:00:00+00:00 920525.0 503556.0 22745.165037 1614.274822
2023-02-07 00:00:00+00:00 944049.0 477572.0 23266.269204 1672.722311

4588 rows × 4 columns

rolling_df.rolling(5, min_periods=3, center=False).mean()
AdrActCnt PriceUSD
asset btc eth btc eth
datetime
2010-07-18 00:00:00+00:00 NaN NaN NaN NaN
2010-07-19 00:00:00+00:00 NaN NaN NaN NaN
2010-07-20 00:00:00+00:00 908.333333 NaN 0.080459 NaN
2010-07-21 00:00:00+00:00 877.250000 NaN 0.080142 NaN
2010-07-22 00:00:00+00:00 820.600000 NaN 0.075808 NaN
... ... ... ... ...
2023-02-03 00:00:00+00:00 985010.400000 502227.2 23321.619961 1620.930898
2023-02-04 00:00:00+00:00 985760.800000 499124.4 23432.839386 1641.942616
2023-02-05 00:00:00+00:00 957325.400000 502775.8 23398.274041 1651.202627
2023-02-06 00:00:00+00:00 931324.800000 500124.8 23201.866527 1645.677170
2023-02-07 00:00:00+00:00 923345.400000 495378.4 23153.999399 1650.954095

4588 rows × 4 columns
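The shorthand methods listed above apply directly to the frame:

rolling_df["PriceUSD"].pct_change()  # day-over-day return per asset
rolling_df["AdrActCnt"].cumsum()     # running total per asset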

Gaussian Window:

num = 18
std_dev = 4.5
window = scipy.signal.windows.gaussian(num, std=std_dev)
z_score = np.linspace(-num/2/std_dev, num/2/std_dev, num)
plt.plot(z_score, window)
plt.title(rf"Gaussian window ($\sigma$={std_dev})")
plt.ylabel("Amplitude")
plt.xlabel("Z-Score")
plt.show()
<Figure size 640x480 with 0 Axes>
_images/pandas_cheatsheet_122_1.png
<Figure size 640x480 with 0 Axes>

Binning#

cut#

Bins are of equal size (width), but the number of entries per bin may not be similar.

# returns pd.Series
pd.cut(test_df['PriceUSD'], bins=5).value_counts()
(-67.491, 13508.392]      6459
(13508.392, 27016.733]     339
(40525.074, 54033.415]     211
(27016.733, 40525.074]     208
(54033.415, 67541.756]     112
Name: PriceUSD, dtype: int64

qcut#

Bins are not of equal size (width), but the number of entries per bin is similar.

# returns pd.Series
pd.qcut(test_df['PriceUSD'], q=5).value_counts()
(0.0495, 16.825]         1466
(16.825, 294.001]        1466
(1095.26, 7548.085]      1466
(7548.085, 67541.756]    1466
(294.001, 1095.26]       1465
Name: PriceUSD, dtype: int64
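Both cut and qcut take labels= to return named categories instead of Interval objects (the labels here are arbitrary):

pd.qcut(test_df['PriceUSD'], q=5, labels=["q1", "q2", "q3", "q4", "q5"]).value_counts()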

Strings#

Regex cheat sheet here

Python string formatting cookbook

nba_df = pd.read_csv("C:/Users/wsaye/PycharmProjects/CashAppInterview/data/nba.csv")
nba_df.dropna(inplace=True)
nba_df
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0
6 Jordan Mickey Boston Celtics 55.0 PF 21.0 6-8 235.0 LSU 1170960.0
7 Kelly Olynyk Boston Celtics 41.0 C 25.0 7-0 238.0 Gonzaga 2165160.0
... ... ... ... ... ... ... ... ... ...
449 Rodney Hood Utah Jazz 5.0 SG 23.0 6-8 206.0 Duke 1348440.0
451 Chris Johnson Utah Jazz 23.0 SF 26.0 6-6 206.0 Dayton 981348.0
452 Trey Lyles Utah Jazz 41.0 PF 20.0 6-10 234.0 Kentucky 2239800.0
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0

364 rows × 9 columns

Search/Replace#

# does not assume regex
# (nba_df["Name"].str.find("a") >= 1).sum()
nba_df["Name"].str.find("J") >= 1
0      False
1      False
3       True
6      False
7      False
       ...  
449    False
451     True
452    False
453    False
456    False
Name: Name, Length: 364, dtype: bool
# assumes regex
# nba_df["Name"].str.contains("^[jJ]")
# nba_df["Name"].str.contains("^(Jo)")
nba_df["Name"].str.extract("^(Jo)")
0
0 NaN
1 NaN
3 NaN
6 Jo
7 NaN
... ...
449 NaN
451 NaN
452 NaN
453 NaN
456 NaN

364 rows × 1 columns

nba_df["Name"].str.replace("^(Jo)", "PORKY", regex=True)
0         Avery Bradley
1           Jae Crowder
3           R.J. Hunter
6      PORKYrdan Mickey
7          Kelly Olynyk
             ...       
449         Rodney Hood
451       Chris Johnson
452          Trey Lyles
453        Shelvin Mack
456         Jeff Withey
Name: Name, Length: 364, dtype: object

Split, Concat#

# assumes regex if len > 1, else literal
nba_df["Name"].str.split(" ")
0      [Avery, Bradley]
1        [Jae, Crowder]
3        [R.J., Hunter]
6      [Jordan, Mickey]
7       [Kelly, Olynyk]
             ...       
449      [Rodney, Hood]
451    [Chris, Johnson]
452       [Trey, Lyles]
453     [Shelvin, Mack]
456      [Jeff, Withey]
Name: Name, Length: 364, dtype: object
# assumes regex if len > 1, else literal
nba_df["Name"].str.split(" ", n=1, expand=True)
0 1
0 Avery Bradley
1 Jae Crowder
3 R.J. Hunter
6 Jordan Mickey
7 Kelly Olynyk
... ... ...
449 Rodney Hood
451 Chris Johnson
452 Trey Lyles
453 Shelvin Mack
456 Jeff Withey

364 rows × 2 columns

nba_df["Name"].str.split(" ").str.len().describe()
count    364.000000
mean       2.032967
std        0.220220
min        2.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        5.000000
Name: Name, dtype: float64
nba_df["Name"].str.cat(others=[nba_df["Team"], nba_df["College"]], sep="-")
0            Avery Bradley-Boston Celtics-Texas
1          Jae Crowder-Boston Celtics-Marquette
3      R.J. Hunter-Boston Celtics-Georgia State
6              Jordan Mickey-Boston Celtics-LSU
7           Kelly Olynyk-Boston Celtics-Gonzaga
                         ...                   
449                  Rodney Hood-Utah Jazz-Duke
451              Chris Johnson-Utah Jazz-Dayton
452               Trey Lyles-Utah Jazz-Kentucky
453               Shelvin Mack-Utah Jazz-Butler
456                Jeff Withey-Utah Jazz-Kansas
Name: Name, Length: 364, dtype: object

Formatting#

# nba_df["Name"].str.upper()
nba_df["Name"].str.lower()
0      avery bradley
1        jae crowder
3        r.j. hunter
6      jordan mickey
7       kelly olynyk
           ...      
449      rodney hood
451    chris johnson
452       trey lyles
453     shelvin mack
456      jeff withey
Name: Name, Length: 364, dtype: object
nba_df['Salary_str'] = (nba_df['Salary']/10**6).map('${:,.2f}M'.format)  # for single Series
nba_df['Salary_str']
# cohort_crosstab = cohort_crosstab.applymap('{:,.0f}%'.format)  # for entire df
0      $7.73M
1      $6.80M
3      $1.15M
6      $1.17M
7      $2.17M
        ...  
449    $1.35M
451    $0.98M
452    $2.24M
453    $2.43M
456    $0.95M
Name: Salary_str, Length: 364, dtype: object

Modeling#

Linear Regression#

lin_reg_df = pd.read_csv('C:/Users/wsaye/PycharmProjects/CashAppInterview/data/lin_reg_test_data.csv')  # load data set
X = lin_reg_df.iloc[:, 0].values.reshape(-1, 1)  # .values converts the column to a numpy array
Y = lin_reg_df.iloc[:, 1].values.reshape(-1, 1)  # -1 infers the number of rows; 1 column
COLOR = lin_reg_df.iloc[:, 2].values.reshape(-1, 1)

ohe = OneHotEncoder(sparse_output=False)
ohe_vals = ohe.fit_transform(COLOR)

X_mat = np.concatenate([X, ohe_vals], axis=1)
def regression_results(y_true, y_pred, lin_reg):

    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred) 
    mse = metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r^2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
    print(f"Coefficients: {lin_reg.coef_}")
    print(f"Intercept: {lin_reg.intercept_}")

Test/Train Split#

X_train, X_test, y_train, y_test = train_test_split(X_mat, Y, test_size = 0.25)

Fit, Predict, Summarize#

regr = LinearRegression()

regr.fit(X_train, y_train)
y_test_pred = regr.predict(X_test)
lin_reg_df["y_pred"] = regr.predict(X_mat)

regression_results(y_test, y_test_pred, regr)
explained_variance:  0.955
mean_squared_log_error:  0.0135
r^2:  0.9548
MAE:  7.8925
MSE:  103.3243
RMSE:  10.1649
Coefficients: [[   1.30312052  110.24915657 -120.19158265    9.94242608]]
Intercept: [-1.35003848]

K-Means#

from sklearn.cluster import KMeans

X_ktrain, X_ktest, y_ktrain, y_ktest = train_test_split(X, Y, test_size = 0.25)

train_set = np.concatenate([X_ktrain, y_ktrain], axis=1)
test_set = np.concatenate([X_ktest, y_ktest], axis=1)

kmeans = KMeans(n_clusters=3, random_state=0, n_init="auto").fit(train_set)
train_labels = kmeans.labels_
test_labels = kmeans.predict(test_set)

Plot Model:

fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Scatter(x=X_ktrain[:, 0], y=y_ktrain[:, 0], mode='markers', name='train',
                         marker=dict(color=train_labels, line_color="black", line_width=1)), row=1, col=1)
fig.add_trace(go.Scatter(x=X_ktest[:, 0], y=y_ktest[:, 0], mode='markers', name='test',
                         marker=dict(color=test_labels, line_color="black", line_width=1, symbol="x")), row=1, col=2)
fig.show()

Plotting#

Matplotlib#

# plt.scatter(X, Y)
# plt.plot(X, Y_pred, color="red", linestyle='None', marker="x")
plt.scatter(lin_reg_df["x"], lin_reg_df["y"])
plt.plot(lin_reg_df["x"], lin_reg_df["y_pred"], color="green", linestyle='None', marker="x")

plt.show()
[figure: scatter of raw data with fitted predictions]

Plotly (Standard)#

fig = go.Figure()
fig.add_trace(go.Scatter(x=X[:, 0], y=Y[:, 0], mode='markers', name='raw data', marker=dict(color="grey")))
# fig.add_trace(go.Scatter(x=X[:, 0], y=Y_pred[:, 0], mode='markers', name='prediction', marker=dict(color=COLOR[:, 0])))

for c in list(np.unique(COLOR[:, 0])):
    temp_x = lin_reg_df.loc[lin_reg_df["color"]==c, "x"]
    temp_y = lin_reg_df.loc[lin_reg_df["color"]==c, "y_pred"]
    fig.add_trace(go.Scatter(x=temp_x, y=temp_y, mode='lines', name='pred-' + c, line_color=c))

fig.show()

Plotly Express#

fig = px.scatter(lin_reg_df, x="x", y="y", color="color")
fig.show()
fig = px.scatter(lin_reg_df, x="x", y="y", color="color", 
                 facet_col="color", 
#                  facet_row="time", 
#                  trendline="ols"
                )
fig.show()

Sample Analyses#

Cohort#

rand_gen = np.random.RandomState(2021)  # set seed

start_date = dt.datetime.strptime("2022-01-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2022-01-10", "%Y-%m-%d")
date_rng = pd.date_range(start_date, end_date).values

total_days = len(date_rng)
num_users = 1000
user_df_list = []

for u in range(0, num_users):
    num_active_days = rand_gen.randint(low=2, high=total_days)
    active_days_index = rand_gen.randint(low=0, high=total_days, size=(1, num_active_days))
    active_dates = pd.Series(date_rng[active_days_index[0, :]])
    user_id = pd.Series([u]*num_active_days)
    user_df = pd.concat([active_dates, user_id], axis=1)
    user_df.columns = ["date", "user_id"]
    user_df_list.append(user_df)

cohort_df = pd.concat(user_df_list)
cohort_df
date user_id
0 2022-01-06 0
1 2022-01-10 0
2 2022-01-01 0
3 2022-01-07 0
4 2022-01-06 0
... ... ...
2 2022-01-10 999
3 2022-01-06 999
4 2022-01-10 999
5 2022-01-03 999
6 2022-01-10 999

5443 rows × 2 columns

first_date = cohort_df.groupby(by=["user_id"]).min().rename(columns={"date": "start_date"})
cohort_df = cohort_df.join(first_date, on="user_id", how="left")
cohort_df
date user_id start_date
0 2022-01-06 0 2022-01-01
1 2022-01-10 0 2022-01-01
2 2022-01-01 0 2022-01-01
3 2022-01-07 0 2022-01-01
4 2022-01-06 0 2022-01-01
... ... ... ...
2 2022-01-10 999 2022-01-03
3 2022-01-06 999 2022-01-03
4 2022-01-10 999 2022-01-03
5 2022-01-03 999 2022-01-03
6 2022-01-10 999 2022-01-03

5443 rows × 3 columns

cohort_crosstab = pd.crosstab(cohort_df['start_date'], cohort_df['date'])
cohort_totals = np.diag(cohort_crosstab).reshape(-1, 1)

cohort_crosstab[cohort_crosstab.columns] = 100 * cohort_crosstab.values / cohort_totals
cohort_crosstab = cohort_crosstab.applymap('{:,.0f}%'.format)
cohort_crosstab
date 2022-01-01 2022-01-02 2022-01-03 2022-01-04 2022-01-05 2022-01-06 2022-01-07 2022-01-08 2022-01-09 2022-01-10
start_date
2022-01-01 100% 39% 45% 43% 42% 45% 47% 47% 34% 42%
2022-01-02 0% 100% 44% 42% 45% 41% 43% 45% 38% 42%
2022-01-03 0% 0% 100% 46% 32% 39% 31% 42% 37% 40%
2022-01-04 0% 0% 0% 100% 38% 43% 33% 39% 35% 43%
2022-01-05 0% 0% 0% 0% 100% 42% 38% 64% 38% 39%
2022-01-06 0% 0% 0% 0% 0% 100% 24% 37% 44% 44%
2022-01-07 0% 0% 0% 0% 0% 0% 100% 47% 63% 42%
2022-01-08 0% 0% 0% 0% 0% 0% 0% 100% 25% 31%
2022-01-09 0% 0% 0% 0% 0% 0% 0% 0% 100% 167%
2022-01-10 0% 0% 0% 0% 0% 0% 0% 0% 0% 100%

Funnel#

stages = ["Website visit", "Downloads", "Potential customers", "Requested price", "Invoice sent"]
df_mtl = pd.DataFrame(dict(number=[39, 27.4, 20.6, 11, 3], stage=stages))
df_mtl['office'] = 'Montreal'
df_toronto = pd.DataFrame(dict(number=[52, 36, 18, 14, 5], stage=stages))
df_toronto['office'] = 'Toronto'
df = pd.concat([df_mtl, df_toronto], axis=0)

fig = px.funnel(df, x='number', y='stage', color='office')
display(fig)
fig2 = make_subplots()
fig2.add_trace(go.Funnel(y = stages,
                         x = [49, 29, 26, 11, 2], 
                         textinfo = "value+percent initial"))
fig2