Commit fd9f4d8

Cleaned up betaVIX and applied override
1 parent 50c869e commit fd9f4d8

3 files changed: +143 -152 lines changed

Signals/pyCode/DataDownloads/VIX.py

Lines changed: 57 additions & 86 deletions
@@ -12,100 +12,71 @@
 """
 
 import os
+
 import pandas as pd
-import numpy as np
 import requests
 from dotenv import load_dotenv
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-from config import MAX_ROWS_DL
+
+
+FRED_URL = "https://api.stlouisfed.org/fred/series/observations"
+OUTPUT_PATH = "../pyData/Intermediate/d_vix.parquet"
+
+print("=" * 60, flush=True)
+print("VIX.py - FRED Volatility Series", flush=True)
+print("=" * 60, flush=True)
 
 load_dotenv()
+print("Environment variables loaded.", flush=True)
 
 
 def download_fred_series(series_id, api_key):
-    # Set up FRED API request parameters
-    url = "https://api.stlouisfed.org/fred/series/observations"
+    """Pull a single FRED series as a tidy DataFrame."""
     params = {
-        'series_id': series_id,
-        'api_key': api_key,
-        'file_type': 'json',
-        'observation_start': '1900-01-01'
+        "series_id": series_id,
+        "api_key": api_key,
+        "file_type": "json",
+        "observation_start": "1900-01-01",
     }
-
-    print(f"Downloading {series_id}...")
-    response = requests.get(url, params=params, timeout=30)
+    response = requests.get(FRED_URL, params=params, timeout=30)
     response.raise_for_status()
-    data = response.json()
-
-    # Process successful response
-    df = pd.DataFrame(data['observations'])
-
-    # Clean and format the data
-    df['date'] = pd.to_datetime(df['date'])
-    df['value'] = pd.to_numeric(df['value'], errors='coerce')
-    df = df[['date', 'value']]
-    df.columns = ['date', series_id]
-    print(f"Successfully downloaded {len(df)} observations")
-    return df
-
-
-print("Downloading VIX data from FRED...")
-
-# Get FRED API key from environment
-fred_api_key = os.getenv("FRED_API_KEY")
-
-# Download both VIX series
-vxocls_data = download_fred_series('VXOCLS', fred_api_key)  # VXO (older series)
-vixcls_data = download_fred_series('VIXCLS', fred_api_key)  # VIX (current series)
-
-# Merge the two series
-vix_data = pd.merge(vxocls_data, vixcls_data, on='date', how='outer')
-vix_data = vix_data.sort_values('date').reset_index(drop=True)
-
-# Create combined VIX series (equivalent to Stata logic)
-cutoff_date = pd.to_datetime('2021-09-23')
-vix_data['vix'] = vix_data['VXOCLS']
-
-# Fill with VIXCLS for missing VXOCLS values after cutoff date
-post_cutoff = vix_data['date'] >= cutoff_date
-missing_vxo = vix_data['VXOCLS'].isna()
-fill_mask = post_cutoff & missing_vxo
-vix_data.loc[fill_mask, 'vix'] = vix_data.loc[fill_mask, 'VIXCLS']
-
-# Keep only necessary columns and rename date first
-final_data = vix_data[['date', 'vix']].copy()
-final_data = final_data.rename(columns={'date': 'time_d'})
-
-# Apply precision control to match Stata format
-final_data['vix'] = final_data['vix'].astype('float32')
-
-# Calculate daily change in VIX (equivalent to gen dVIX = vix - l.vix)
-final_data['dVIX'] = final_data['vix'].diff().astype('float32')
-
-# Apply row limit for debugging if configured
-if MAX_ROWS_DL > 0:
-    final_data = final_data.head(MAX_ROWS_DL)
-    print(f"DEBUG MODE: Limited to {MAX_ROWS_DL} rows")
-
-# Save the data
-final_data.to_parquet("../pyData/Intermediate/d_vix.parquet")
-
-# Print summary information
-print(f"VIX data saved with {len(final_data)} records")
-date_min = final_data['time_d'].min().strftime('%Y-%m-%d')
-date_max = final_data['time_d'].max().strftime('%Y-%m-%d')
-print(f"Date range: {date_min} to {date_max}")
-
-print("\nSample data:")
-print(final_data.head())
-
-print("\nVIX summary:")
-print(f"Total records: {len(final_data)}")
-print(f"Missing VIX values: {final_data['vix'].isna().sum()}")
-print(f"Missing dVIX values: {final_data['dVIX'].isna().sum()}")
-print(f"Mean: {final_data['vix'].mean():.2f}")
-print(f"Std: {final_data['vix'].std():.2f}")
-print(f"Min: {final_data['vix'].min():.2f}")
-print(f"Max: {final_data['vix'].max():.2f}")
+    observations = response.json()["observations"]
+
+    # Build DataFrame with parsed dates and numeric values for the requested series
+    df = pd.DataFrame(observations)
+    df["date"] = pd.to_datetime(df["date"])
+    df["value"] = pd.to_numeric(df["value"], errors="coerce")
+    return df.rename(columns={"value": series_id})[["date", series_id]]
+
+
+print("Downloading VIX data from FRED...", flush=True)
+api_key = os.getenv("FRED_API_KEY")
+
+if not api_key:
+    # Fail fast so the user knows credentials are missing before making requests
+    raise ValueError("FRED_API_KEY not found in environment variables")
+
+vxocls = download_fred_series("VXOCLS", api_key)
+vixcls = download_fred_series("VIXCLS", api_key)
+
+print(f"Downloaded {len(vxocls)} VXO observations and {len(vixcls)} VIX observations.", flush=True)
+
+cutoff = pd.Timestamp("2021-09-23")
+# Merge both series and take VXO up to the cutoff, VIX afterwards to build a continuous history
+vix_data = vxocls.merge(vixcls, on="date", how="outer").sort_values("date")
+
+vix_data["vix"] = vix_data["VXOCLS"]
+fill_mask = (vix_data["date"] >= cutoff) & vix_data["VXOCLS"].isna()
+vix_data.loc[fill_mask, "vix"] = vix_data.loc[fill_mask, "VIXCLS"]
+
+# Compute daily changes for the blended series and persist to parquet
+final_data = vix_data[["date", "vix"]].rename(columns={"date": "time_d"})
+final_data["vix"] = final_data["vix"].astype("float32")
+final_data["dVIX"] = final_data["vix"].diff().astype("float32")
+final_data.to_parquet(OUTPUT_PATH)
+
+date_min = final_data["time_d"].min().date()
+date_max = final_data["time_d"].max().date()
+print(f"Saved {len(final_data)} rows to {OUTPUT_PATH}", flush=True)
+print(f"Date range: {date_min} to {date_max}", flush=True)
+print("=" * 60, flush=True)
+print("VIX.py completed successfully", flush=True)
+print("=" * 60, flush=True)

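For reviewers who want to spot-check the new output, here is a minimal sketch (not part of the commit) that reads the parquet written above and verifies the VXO-to-VIX override around the 2021-09-23 cutoff. The five-day inspection window and the allclose comparison are illustrative choices:

    import numpy as np
    import pandas as pd

    # Load the blended series written by VIX.py
    vix = pd.read_parquet("../pyData/Intermediate/d_vix.parquet")

    # Inspect the splice window around the cutoff where VIXCLS begins
    # filling in for missing VXOCLS values
    cutoff = pd.Timestamp("2021-09-23")
    mask = vix["time_d"].between(cutoff - pd.Timedelta("5D"), cutoff + pd.Timedelta("5D"))
    print(vix.loc[mask])

    # dVIX should equal the first difference of the blended float32 series
    assert np.allclose(vix["vix"].diff(), vix["dVIX"], equal_nan=True)
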
Signals/pyCode/Predictors/ZZ2_betaVIX.py

Lines changed: 65 additions & 52 deletions
@@ -14,44 +14,50 @@
 - betaVIX = coefficient on daily change in VIX from 1-month rolling regression (20-day window, min 15 obs)
 """
 
+import os
+import sys
+
+import pandas as pd
 import polars as pl
 import polars_ols as pls  # Registers .least_squares namespace
-import sys
-import os
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 from utils.save_standardized import save_predictor
 
-print("Starting ZZ2_betaVIX.py...")
-
-# Data load
-print("Loading data...")
-daily_crsp = pl.read_parquet("../pyData/Intermediate/dailyCRSP.parquet")
-daily_ff = pl.read_parquet("../pyData/Intermediate/dailyFF.parquet")
-d_vix = pl.read_parquet("../pyData/Intermediate/d_vix.parquet")
-
-# Select required columns
-df = daily_crsp.select(["permno", "time_d", "ret"])
-
-# Merge with FF data
-df = df.join(daily_ff.select(["time_d", "rf", "mktrf"]), on="time_d", how="inner")
 
-# Calculate excess return
-df = df.with_columns([(pl.col("ret") - pl.col("rf")).alias("ret_excess")])
+print("=" * 80)
+print("ZZ2_betaVIX.py")
+print("Generating betaVIX predictor from daily market and VIX data")
+print("=" * 80)
 
-# Merge with VIX data
-df = df.join(d_vix.select(["time_d", "dVIX"]), on="time_d", how="inner")
-
-# Critical: Sort data first (from Beta.py success pattern)
-df = df.sort(["permno", "time_d"])
+# DATA LOAD
+print("Loading daily datasets...")
+print("Loading dailyCRSP.parquet...")
+daily_crsp = pl.read_parquet("../pyData/Intermediate/dailyCRSP.parquet")
+print(f"Loaded daily CRSP observations: {len(daily_crsp):,}")
 
-# Set up time index for rolling window
-df = df.with_columns([pl.int_range(pl.len()).over("permno").alias("time_temp")])
+print("Loading dailyFF.parquet...")
+daily_ff = pl.read_parquet("../pyData/Intermediate/dailyFF.parquet")
+print(f"Loaded daily Fama-French observations: {len(daily_ff):,}")
 
-# Use direct polars-ols for rolling regression
-# Rolling regression of excess returns on market factor and VIX changes using 20-day window with minimum 15 observations
+print("Loading d_vix.parquet...")
+d_vix = pl.read_parquet("../pyData/Intermediate/d_vix.parquet")
+print(f"Loaded daily VIX change observations: {len(d_vix):,}")
+
+# MERGE DATA SOURCES
+print("Merging CRSP returns with factors and VIX changes...")
+df = (
+    daily_crsp.select(["permno", "time_d", "ret"])
+    .join(daily_ff.select(["time_d", "rf", "mktrf"]), on="time_d", how="inner")
+    .with_columns((pl.col("ret") - pl.col("rf")).alias("ret_excess"))
+    .join(d_vix.select(["time_d", "dVIX"]), on="time_d", how="inner")
+    .sort(["permno", "time_d"])
+)
+print(f"Combined daily panel observations: {len(df):,}")
+print(f"Unique permnos in panel: {df['permno'].n_unique():,}")
 
-# Sort is already done above
+# ROLLING REGRESSION
+print("Running rolling 20-day regressions (min 15 obs) per permno...")
 df = df.with_columns(
     pl.col("ret_excess")
     .least_squares.rolling_ols(
@@ -65,29 +71,36 @@
     )
     .over("permno")
     .alias("coef")
-).with_columns(
-    [
-        pl.col("coef").struct.field("const").alias("b_const"),
-        pl.col("coef").struct.field("mktrf").alias("b_mktrf"),
-        pl.col("coef").struct.field("dVIX").alias("b_dVIX"),
-    ]
+).with_columns(pl.col("coef").struct.field("dVIX").alias("betaVIX"))
+print("Extracted betaVIX coefficients from rolling regressions")
+
+# MONTHLY AGGREGATION
+print("Aggregating daily coefficients to month-end values...")
+monthly = (
+    df.drop("coef")
+    .with_columns(pl.col("time_d").dt.truncate("1mo").alias("time_avail_m"))
+    .sort(["permno", "time_avail_m", "time_d"])
+    .group_by(["permno", "time_avail_m"])
+    .agg(pl.col("betaVIX").drop_nulls().last().alias("betaVIX"))
+    .select(["permno", "time_avail_m", "betaVIX"])
 )
-
-# Extract betaVIX coefficient from dVIX regression term
-df = df.with_columns([pl.col("b_dVIX").alias("betaVIX")])
-
-# Convert to monthly and keep last observation per month
-df = df.with_columns([pl.col("time_d").dt.truncate("1mo").alias("time_avail_m")])
-
-# Keep last non-missing betaVIX per permno-month
-df = df.sort(["permno", "time_avail_m", "time_d"])
-df = df.group_by(["permno", "time_avail_m"]).agg(
-    [pl.col("betaVIX").drop_nulls().last().alias("betaVIX")]
-)
-
-# Select final data
-result = df.select(["permno", "time_avail_m", "betaVIX"])
-
-# Save predictor
-save_predictor(result, "betaVIX")
-print("ZZ2_betaVIX.py completed successfully")
+print(f"Monthly betaVIX rows: {len(monthly):,}")
+
+if len(monthly) > 0:
+    monthly_pd = monthly.to_pandas()
+    print("betaVIX summary stats:")
+    print(f" Mean: {monthly_pd['betaVIX'].mean():.6f}")
+    print(f" Std: {monthly_pd['betaVIX'].std():.6f}")
+    print(f" Min: {monthly_pd['betaVIX'].min():.6f}")
+    print(f" Max: {monthly_pd['betaVIX'].max():.6f}")
+
+    # SAVE OUTPUT
+    print("Saving betaVIX predictor...")
+    save_predictor(monthly_pd, "betaVIX")
+    print("betaVIX predictor saved")
+else:
+    print("No betaVIX values produced; skipping save")
+
+print("=" * 80)
+print("betaVIX pipeline complete")
+print("=" * 80)

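Note: the rolling_ols arguments themselves are unchanged context between the two hunks, so they do not appear in this diff. Based on the module docstring (20-day window, minimum 15 observations) and the struct fields the old code extracted (const, mktrf, dVIX), the full call plausibly reads like the sketch below; the window_size, min_periods, mode, and add_intercept values are inferred assumptions, not text from this commit:

    # Hypothetical reconstruction of the elided rolling_ols call; parameter
    # values inferred from the docstring, not shown in the diff
    df = df.with_columns(
        pl.col("ret_excess")
        .least_squares.rolling_ols(
            pl.col("mktrf"),      # market factor regressor
            pl.col("dVIX"),       # daily VIX change regressor
            window_size=20,       # roughly one trading month of daily data
            min_periods=15,       # minimum observations before emitting a beta
            mode="coefficients",  # return a struct of per-regressor betas
            add_intercept=True,   # yields the "const" field the old code read
        )
        .over("permno")           # fit separately for each stock
        .alias("coef")
    ).with_columns(pl.col("coef").struct.field("dVIX").alias("betaVIX"))
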