 12 |  12 | """
 13 |  13 |
 14 |  14 | import os
    |  15 | +
 15 |  16 | import pandas as pd
 16 |     | -import numpy as np
 17 |  17 | import requests
 18 |  18 | from dotenv import load_dotenv
 19 |     | -import sys
 20 |     | -import os
 21 |     | -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
 22 |     | -from config import MAX_ROWS_DL
    |  19 | +
    |  20 | +FRED_URL = "https://api.stlouisfed.org/fred/series/observations"
    |  21 | +OUTPUT_PATH = "../pyData/Intermediate/d_vix.parquet"
    |  22 | +
    |  23 | +print("=" * 60, flush=True)
    |  24 | +print("VIX.py - FRED Volatility Series", flush=True)
    |  25 | +print("=" * 60, flush=True)
 23 |  26 |
 24 |  27 | load_dotenv()
    |  28 | +print("Environment variables loaded.", flush=True)
 25 |  29 |
 26 |  30 |
 27 |  31 | def download_fred_series(series_id, api_key):
 28 |     | -    # Set up FRED API request parameters
 29 |     | -    url = "https://api.stlouisfed.org/fred/series/observations"
    |  32 | +    """Pull a single FRED series as a tidy DataFrame."""
 30 |  33 |     params = {
 31 |     | -        'series_id': series_id,
 32 |     | -        'api_key': api_key,
 33 |     | -        'file_type': 'json',
 34 |     | -        'observation_start': '1900-01-01'
    |  34 | +        "series_id": series_id,
    |  35 | +        "api_key": api_key,
    |  36 | +        "file_type": "json",
    |  37 | +        "observation_start": "1900-01-01",
 35 |  38 |     }
 36 |     | -
 37 |     | -    print(f"Downloading {series_id}...")
 38 |     | -    response = requests.get(url, params=params, timeout=30)
    |  39 | +    response = requests.get(FRED_URL, params=params, timeout=30)
 39 |  40 |     response.raise_for_status()
 40 |     | -    data = response.json()
 41 |     | -
 42 |     | -    # Process successful response
 43 |     | -    df = pd.DataFrame(data['observations'])
 44 |     | -
 45 |     | -    # Clean and format the data
 46 |     | -    df['date'] = pd.to_datetime(df['date'])
 47 |     | -    df['value'] = pd.to_numeric(df['value'], errors='coerce')
 48 |     | -    df = df[['date', 'value']]
 49 |     | -    df.columns = ['date', series_id]
 50 |     | -    print(f"Successfully downloaded {len(df)} observations")
 51 |     | -    return df
 52 |     | -
 53 |     | -
 54 |     | -print("Downloading VIX data from FRED...")
 55 |     | -
 56 |     | -# Get FRED API key from environment
 57 |     | -fred_api_key = os.getenv("FRED_API_KEY")
 58 |     | -
 59 |     | -# Download both VIX series
 60 |     | -vxocls_data = download_fred_series('VXOCLS', fred_api_key)  # VXO (older series)
 61 |     | -vixcls_data = download_fred_series('VIXCLS', fred_api_key)  # VIX (current series)
 62 |     | -
 63 |     | -# Merge the two series
 64 |     | -vix_data = pd.merge(vxocls_data, vixcls_data, on='date', how='outer')
 65 |     | -vix_data = vix_data.sort_values('date').reset_index(drop=True)
 66 |     | -
 67 |     | -# Create combined VIX series (equivalent to Stata logic)
 68 |     | -cutoff_date = pd.to_datetime('2021-09-23')
 69 |     | -vix_data['vix'] = vix_data['VXOCLS']
 70 |     | -
 71 |     | -# Fill with VIXCLS for missing VXOCLS values after cutoff date
 72 |     | -post_cutoff = vix_data['date'] >= cutoff_date
 73 |     | -missing_vxo = vix_data['VXOCLS'].isna()
 74 |     | -fill_mask = post_cutoff & missing_vxo
 75 |     | -vix_data.loc[fill_mask, 'vix'] = vix_data.loc[fill_mask, 'VIXCLS']
 76 |     | -
 77 |     | -# Keep only necessary columns and rename date first
 78 |     | -final_data = vix_data[['date', 'vix']].copy()
 79 |     | -final_data = final_data.rename(columns={'date': 'time_d'})
 80 |     | -
 81 |     | -# Apply precision control to match Stata format
 82 |     | -final_data['vix'] = final_data['vix'].astype('float32')
 83 |     | -
 84 |     | -# Calculate daily change in VIX (equivalent to gen dVIX = vix - l.vix)
 85 |     | -final_data['dVIX'] = final_data['vix'].diff().astype('float32')
 86 |     | -
 87 |     | -# Apply row limit for debugging if configured
 88 |     | -if MAX_ROWS_DL > 0:
 89 |     | -    final_data = final_data.head(MAX_ROWS_DL)
 90 |     | -    print(f"DEBUG MODE: Limited to {MAX_ROWS_DL} rows")
 91 |     | -
 92 |     | -# Save the data
 93 |     | -final_data.to_parquet("../pyData/Intermediate/d_vix.parquet")
 94 |     | -
 95 |     | -# Print summary information
 96 |     | -print(f"VIX data saved with {len(final_data)} records")
 97 |     | -date_min = final_data['time_d'].min().strftime('%Y-%m-%d')
 98 |     | -date_max = final_data['time_d'].max().strftime('%Y-%m-%d')
 99 |     | -print(f"Date range: {date_min} to {date_max}")
100 |     | -
101 |     | -print("\nSample data:")
102 |     | -print(final_data.head())
103 |     | -
104 |     | -print("\nVIX summary:")
105 |     | -print(f"Total records: {len(final_data)}")
106 |     | -print(f"Missing VIX values: {final_data['vix'].isna().sum()}")
107 |     | -print(f"Missing dVIX values: {final_data['dVIX'].isna().sum()}")
108 |     | -print(f"Mean: {final_data['vix'].mean():.2f}")
109 |     | -print(f"Std: {final_data['vix'].std():.2f}")
110 |     | -print(f"Min: {final_data['vix'].min():.2f}")
111 |     | -print(f"Max: {final_data['vix'].max():.2f}")
    |  41 | +    observations = response.json()["observations"]
    |  42 | +
    |  43 | +    # Build DataFrame with parsed dates and numeric values for the requested series
    |  44 | +    df = pd.DataFrame(observations)
    |  45 | +    df["date"] = pd.to_datetime(df["date"])
    |  46 | +    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    |  47 | +    return df.rename(columns={"value": series_id})[["date", series_id]]
    |  48 | +
    |  49 | +
    |  50 | +print("Downloading VIX data from FRED...", flush=True)
    |  51 | +api_key = os.getenv("FRED_API_KEY")
    |  52 | +
    |  53 | +if not api_key:
    |  54 | +    # Fail fast so the user knows credentials are missing before making requests
    |  55 | +    raise ValueError("FRED_API_KEY not found in environment variables")
    |  56 | +
    |  57 | +vxocls = download_fred_series("VXOCLS", api_key)
    |  58 | +vixcls = download_fred_series("VIXCLS", api_key)
    |  59 | +
    |  60 | +print(f"Downloaded {len(vxocls)} VXO observations and {len(vixcls)} VIX observations.", flush=True)
    |  61 | +
    |  62 | +cutoff = pd.Timestamp("2021-09-23")
    |  63 | +# Merge both series and take VXO up to the cutoff, VIX afterwards to build a continuous history
    |  64 | +vix_data = vxocls.merge(vixcls, on="date", how="outer").sort_values("date")
    |  65 | +
    |  66 | +vix_data["vix"] = vix_data["VXOCLS"]
    |  67 | +fill_mask = (vix_data["date"] >= cutoff) & vix_data["VXOCLS"].isna()
    |  68 | +vix_data.loc[fill_mask, "vix"] = vix_data.loc[fill_mask, "VIXCLS"]
    |  69 | +
    |  70 | +# Compute daily changes for the blended series and persist to parquet
    |  71 | +final_data = vix_data[["date", "vix"]].rename(columns={"date": "time_d"})
    |  72 | +final_data["vix"] = final_data["vix"].astype("float32")
    |  73 | +final_data["dVIX"] = final_data["vix"].diff().astype("float32")
    |  74 | +final_data.to_parquet(OUTPUT_PATH)
    |  75 | +
    |  76 | +date_min = final_data["time_d"].min().date()
    |  77 | +date_max = final_data["time_d"].max().date()
    |  78 | +print(f"Saved {len(final_data)} rows to {OUTPUT_PATH}", flush=True)
    |  79 | +print(f"Date range: {date_min} to {date_max}", flush=True)
    |  80 | +print("=" * 60, flush=True)
    |  81 | +print("VIX.py completed successfully", flush=True)
    |  82 | +print("=" * 60, flush=True)