# Source code for scr_financial.data.collectors.market_collector

"""
Market data collector for financial network analysis.

This module provides the MarketDataCollector class for retrieving market data
such as CDS spreads, equity prices, and other market indicators.
"""

import logging
import pandas as pd
import numpy as np
from typing import List, Optional

logger = logging.getLogger(__name__)


def _is_stress_period(date: pd.Timestamp, year_month: str) -> bool:
    """Tell whether *date* lies inside the calendar month *year_month*.

    Args:
        date: Timestamp to classify.
        year_month: Target month written as 'YYYY-MM'.

    Returns:
        True when formatting *date* as 'YYYY-MM' yields exactly *year_month*,
        False otherwise.
    """
    # Render the timestamp in the same 'YYYY-MM' layout as the argument so the
    # two can be compared as plain strings.
    month_label = date.strftime("%Y-%m")
    return month_label == year_month


class MarketDataCollector:
    """
    Collector for market data.

    This class provides methods for retrieving market data such as CDS spreads,
    equity prices, SRISK, and other market indicators from various sources.

    Every ``collect_*`` method currently delegates to a ``_get_sample_*``
    generator that fabricates data with a random generator seeded in
    ``__init__``; no network request is made.
    """

    # Labels for the stress episodes recognised by the sample generators.
    _FINANCIAL_CRISIS = "financial_crisis"
    _EURO_DEBT_CRISIS = "euro_debt_crisis"
    _COVID = "covid"

    # Calendar months ('YYYY-MM') belonging to each episode, checked in order.
    _STRESS_MONTHS = (
        (_FINANCIAL_CRISIS, ("2008-09", "2008-10")),
        (_EURO_DEBT_CRISIS, ("2011-08", "2011-09")),
        (_COVID, ("2020-03", "2020-04")),
    )

    # Multipliers applied to CDS spreads and funding spreads during stress.
    _SPREAD_STRESS_FACTORS = {
        _FINANCIAL_CRISIS: 5.0,
        _EURO_DEBT_CRISIS: 3.0,
        _COVID: 4.0,
    }

    # Multipliers applied to SRISK during stress.
    _SRISK_STRESS_FACTORS = {
        _FINANCIAL_CRISIS: 2.5,
        _EURO_DEBT_CRISIS: 1.8,
        _COVID: 2.0,
    }

    # (drift, volatility) of the daily equity random walk per episode.
    _EQUITY_REGIMES = {
        _FINANCIAL_CRISIS: (-0.01, 0.03),
        _EURO_DEBT_CRISIS: (-0.005, 0.025),
        _COVID: (-0.015, 0.035),
    }
    _EQUITY_CALM_REGIME = (0.0002, 0.015)

    # Banks used when the caller does not supply ``bank_list``.
    _DEFAULT_BANKS = (
        "DE_DBK",
        "FR_BNP",
        "ES_SAN",
        "IT_UCG",
        "NL_ING",
        "SE_NDA",
        "CH_UBS",
        "UK_BARC",
        "UK_HSBC",
        "FR_ACA",
    )

    # Base CDS spread for each bank.
    _BASE_CDS = {
        "DE_DBK": 85,  # Deutsche Bank
        "FR_BNP": 70,  # BNP Paribas
        "ES_SAN": 90,  # Santander
        "IT_UCG": 100,  # UniCredit
        "NL_ING": 65,  # ING
        "SE_NDA": 55,  # Nordea
        "CH_UBS": 60,  # UBS
        "UK_BARC": 80,  # Barclays
        "UK_HSBC": 65,  # HSBC
        "FR_ACA": 75,  # Credit Agricole
    }

    # Base SRISK for each bank (in billions EUR).
    _BASE_SRISK = {
        "DE_DBK": 35,  # Deutsche Bank
        "FR_BNP": 40,  # BNP Paribas
        "ES_SAN": 30,  # Santander
        "IT_UCG": 28,  # UniCredit
        "NL_ING": 22,  # ING
        "SE_NDA": 15,  # Nordea
        "CH_UBS": 25,  # UBS
        "UK_BARC": 32,  # Barclays
        "UK_HSBC": 38,  # HSBC
        "FR_ACA": 27,  # Credit Agricole
    }

    # Base equity price for each bank (in EUR).
    _BASE_EQUITY_PRICE = {
        "DE_DBK": 12.5,  # Deutsche Bank
        "FR_BNP": 55.0,  # BNP Paribas
        "ES_SAN": 3.8,  # Santander
        "IT_UCG": 15.2,  # UniCredit
        "NL_ING": 12.8,  # ING
        "SE_NDA": 10.5,  # Nordea
        "CH_UBS": 18.2,  # UBS
        "UK_BARC": 2.1,  # Barclays
        "UK_HSBC": 6.5,  # HSBC
        "FR_ACA": 11.8,  # Credit Agricole
    }

    def __init__(self) -> None:
        """Initialize the market data collector."""
        self.fred_base_url = "https://api.stlouisfed.org/fred/series/observations"
        self.vlab_base_url = "https://vlab.stern.nyu.edu/api/v2"
        self.bis_base_url = "https://www.bis.org/statistics/api/data"
        # Fixed seed: a freshly constructed collector always yields the same
        # sample data for the same sequence of calls.
        self._rng = np.random.default_rng(seed=42)

    def __repr__(self) -> str:
        """Return a string representation of this collector.

        Returns:
            str: Class name and FRED base URL.
        """
        return f"{self.__class__.__name__}(fred_base_url={self.fred_base_url!r})"

    def collect_cds_data(
        self,
        start_date: str,
        end_date: str,
        bank_list: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Collect CDS spread data.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.
            bank_list: List of bank identifiers to include.

        Returns:
            DataFrame with columns ``date``, ``bank_id`` and ``CDS_5yr``.
        """
        # TODO: replace with a real data-provider call when credentials are
        # available. For now, use sample data.
        return self._get_sample_cds_data(start_date, end_date, bank_list)

    def collect_srisk_data(
        self,
        start_date: str,
        end_date: str,
        bank_list: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Collect SRISK data from NYU V-Lab.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.
            bank_list: List of bank identifiers to include.

        Returns:
            DataFrame with columns ``date``, ``bank_id`` and ``SRISK``.
        """
        # TODO: replace with a real V-Lab API call (see self.vlab_base_url)
        # when credentials are available. For now, use sample data.
        return self._get_sample_srisk_data(start_date, end_date, bank_list)

    def collect_equity_data(
        self,
        start_date: str,
        end_date: str,
        bank_list: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Collect equity price data.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.
            bank_list: List of bank identifiers to include.

        Returns:
            DataFrame with columns ``date``, ``bank_id``, ``price`` and
            ``volume``.
        """
        # TODO: replace with a real data-provider call when credentials are
        # available. For now, use sample data.
        return self._get_sample_equity_data(start_date, end_date, bank_list)

    def collect_funding_stress_data(
        self, start_date: str, end_date: str
    ) -> pd.DataFrame:
        """Collect funding stress indicators.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.

        Returns:
            DataFrame with columns ``date``, ``LIBOR_OIS_spread`` and
            ``GC_Repo_TBill_spread``.
        """
        # TODO: replace with a real API call when credentials are available
        # (presumably FRED via self.fred_base_url -- confirm the intended
        # source). For now, use sample data.
        return self._get_sample_funding_stress_data(start_date, end_date)

    def collect_bis_data(
        self, start_date: str, end_date: str, indicator: str
    ) -> pd.DataFrame:
        """Collect data from Bank for International Settlements (BIS).

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.
            indicator: Indicator to collect; either 'credit_to_GDP_gap' or
                'house_price_gap'.

        Returns:
            DataFrame with columns ``date``, ``country`` and a value column
            named after *indicator*.

        Raises:
            ValueError: If *indicator* is not one of the supported names.
        """
        # TODO: replace with a real BIS API call (see self.bis_base_url) when
        # credentials are available. For now, use sample data.
        if indicator == "credit_to_GDP_gap":
            return self._get_sample_credit_gap_data(start_date, end_date)
        elif indicator == "house_price_gap":
            return self._get_sample_house_price_gap_data(start_date, end_date)
        else:
            raise ValueError(f"Unknown BIS indicator: {indicator}")

    @classmethod
    def _stress_episode(cls, date: pd.Timestamp) -> Optional[str]:
        """Return the label of the stress episode containing *date*, if any."""
        for label, months in cls._STRESS_MONTHS:
            if any(_is_stress_period(date, month) for month in months):
                return label
        return None

    @classmethod
    def _resolve_banks(cls, bank_list: Optional[List[str]]) -> List[str]:
        """Return *bank_list*, or the default bank universe when it is None."""
        # An explicit empty list is respected: it means "no banks".
        if bank_list is None:
            return list(cls._DEFAULT_BANKS)
        return bank_list

    def _get_sample_cds_data(
        self,
        start_date: str,
        end_date: str,
        bank_list: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Generate sample CDS data for testing.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.
            bank_list: List of bank identifiers to include.

        Returns:
            DataFrame with one row per business day and bank, with columns
            ``date``, ``bank_id`` and ``CDS_5yr``. The columns are present
            even when no rows are generated.
        """
        banks = self._resolve_banks(bank_list)
        date_range = pd.date_range(start=start_date, end=end_date, freq="B")

        rows = []
        for date in date_range:
            stress_factor = self._SPREAD_STRESS_FACTORS.get(
                self._stress_episode(date), 1.0
            )
            for bank_id in banks:
                # Banks without a configured base spread fall back to 75.
                base = self._BASE_CDS.get(bank_id, 75)
                # Scale by the stress factor and add 5% multiplicative noise.
                cds = base * stress_factor * (1 + self._rng.normal(0, 0.05))
                rows.append(
                    {"date": date, "bank_id": bank_id, "CDS_5yr": round(cds, 1)}
                )

        return pd.DataFrame(rows, columns=["date", "bank_id", "CDS_5yr"])

    def _get_sample_srisk_data(
        self,
        start_date: str,
        end_date: str,
        bank_list: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Generate sample SRISK data for testing.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.
            bank_list: List of bank identifiers to include.

        Returns:
            DataFrame with one row per week and bank, with columns ``date``,
            ``bank_id`` and ``SRISK``. The columns are present even when no
            rows are generated.
        """
        banks = self._resolve_banks(bank_list)
        date_range = pd.date_range(start=start_date, end=end_date, freq="W")

        rows = []
        for date in date_range:
            stress_factor = self._SRISK_STRESS_FACTORS.get(
                self._stress_episode(date), 1.0
            )
            for bank_id in banks:
                # Banks without a configured base SRISK fall back to 25.
                base = self._BASE_SRISK.get(bank_id, 25)
                # Scale by the stress factor and add 8% multiplicative noise.
                srisk = base * stress_factor * (1 + self._rng.normal(0, 0.08))
                # Base values are in billions EUR; convert to plain EUR.
                srisk = srisk * 1e9
                rows.append(
                    {"date": date, "bank_id": bank_id, "SRISK": round(srisk, 0)}
                )

        return pd.DataFrame(rows, columns=["date", "bank_id", "SRISK"])

    def _get_sample_equity_data(
        self,
        start_date: str,
        end_date: str,
        bank_list: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Generate sample equity price data for testing.

        Each bank's price follows a multiplicative random walk starting at
        its base price, with drift and volatility that depend on the stress
        episode of each day.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.
            bank_list: List of bank identifiers to include.

        Returns:
            DataFrame with one row per business day and bank, with columns
            ``date``, ``bank_id``, ``price`` and ``volume``. The columns are
            present even when no rows are generated.
        """
        banks = self._resolve_banks(bank_list)
        date_range = pd.date_range(start=start_date, end=end_date, freq="B")

        # The regime of each step depends only on the date, so resolve it once
        # instead of once per bank. The first day has no step (base price).
        step_regimes = [
            self._EQUITY_REGIMES.get(
                self._stress_episode(day), self._EQUITY_CALM_REGIME
            )
            for day in date_range[1:]
        ]

        rows = []
        for bank_id in banks:
            # Banks without a configured base price start at 10.0.
            prices = [self._BASE_EQUITY_PRICE.get(bank_id, 10.0)]
            for drift, volatility in step_regimes:
                change = drift + self._rng.normal(0, volatility)
                # Floor the price at 0.1 so the walk never reaches zero.
                prices.append(max(0.1, prices[-1] * (1 + change)))

            for day, price in zip(date_range, prices):
                rows.append(
                    {
                        "date": day,
                        "bank_id": bank_id,
                        "price": round(price, 2),
                        "volume": int(self._rng.uniform(1_000_000, 10_000_000)),
                    }
                )

        return pd.DataFrame(rows, columns=["date", "bank_id", "price", "volume"])

    def _get_sample_funding_stress_data(
        self, start_date: str, end_date: str
    ) -> pd.DataFrame:
        """Generate sample funding stress indicators for testing.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.

        Returns:
            DataFrame with one row per business day, with columns ``date``,
            ``LIBOR_OIS_spread`` and ``GC_Repo_TBill_spread``. The columns are
            present even when no rows are generated.
        """
        date_range = pd.date_range(start=start_date, end=end_date, freq="B")

        base_libor_ois = 0.15  # LIBOR-OIS spread in percentage points
        base_gc_tbill = 0.08  # GC Repo - T-Bill spread in percentage points

        rows = []
        for date in date_range:
            stress_factor = self._SPREAD_STRESS_FACTORS.get(
                self._stress_episode(date), 1.0
            )
            # Independent multiplicative noise per indicator (10% and 12%).
            libor_ois = base_libor_ois * stress_factor * (
                1 + self._rng.normal(0, 0.1)
            )
            gc_tbill = base_gc_tbill * stress_factor * (
                1 + self._rng.normal(0, 0.12)
            )
            rows.append(
                {
                    "date": date,
                    "LIBOR_OIS_spread": round(libor_ois, 3),
                    "GC_Repo_TBill_spread": round(gc_tbill, 3),
                }
            )

        return pd.DataFrame(
            rows, columns=["date", "LIBOR_OIS_spread", "GC_Repo_TBill_spread"]
        )

    def _sample_gap_frame(
        self,
        start_date: str,
        end_date: str,
        value_column: str,
        base_gaps,
        trend_per_year: float,
        crisis_2008,
        crisis_2011,
        calm_sigma: float,
    ) -> pd.DataFrame:
        """Build a quarterly country-level gap series shared by BIS samples.

        Args:
            start_date: Start date in 'YYYY-MM-DD' format; also the origin of
                the linear trend.
            end_date: End date in 'YYYY-MM-DD' format.
            value_column: Name of the generated value column.
            base_gaps: Mapping of country code to its base gap.
            trend_per_year: Gap added per 365 days elapsed since *start_date*.
            crisis_2008: ``(shift, noise_sigma)`` used for 2008 and 2009.
            crisis_2011: ``(shift, noise_sigma)`` used for 2011 and 2012.
            calm_sigma: Noise standard deviation for all other years.

        Returns:
            DataFrame with columns ``date``, ``country`` and *value_column*,
            one row per quarter end and country. The columns are present even
            when no rows are generated.
        """
        # Quarter ends anchored on December, the meaning of the legacy "Q"
        # alias, passed as an offset object so it works whether or not the
        # installed pandas still accepts that alias.
        date_range = pd.date_range(
            start=start_date,
            end=end_date,
            freq=pd.offsets.QuarterEnd(startingMonth=12),
        )
        origin = pd.to_datetime(start_date)

        rows = []
        for date in date_range:
            # Years elapsed since the start, driving the linear trend.
            time_factor = (date - origin).days / 365

            if date.year in (2008, 2009):  # Financial crisis
                shift, sigma = crisis_2008
            elif date.year in (2011, 2012):  # European debt crisis
                shift, sigma = crisis_2011
            else:
                shift, sigma = 0.0, calm_sigma

            for country, base_gap in base_gaps.items():
                gap = (
                    base_gap
                    + shift
                    + trend_per_year * time_factor
                    + self._rng.normal(0, sigma)
                )
                rows.append(
                    {"date": date, "country": country, value_column: round(gap, 2)}
                )

        return pd.DataFrame(rows, columns=["date", "country", value_column])

    def _get_sample_credit_gap_data(
        self, start_date: str, end_date: str
    ) -> pd.DataFrame:
        """Generate sample credit-to-GDP gap data for testing.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.

        Returns:
            DataFrame with columns ``date``, ``country`` and
            ``credit_to_GDP_gap``, one row per quarter end and country.
        """
        return self._sample_gap_frame(
            start_date,
            end_date,
            value_column="credit_to_GDP_gap",
            base_gaps={
                "USA": 2.5,
                "EUR": 3.2,
                "GBR": 1.8,
                "CHE": 4.5,
                "JPN": -0.5,
            },
            trend_per_year=0.5,
            crisis_2008=(5.0, 1.0),
            crisis_2011=(3.0, 0.8),
            calm_sigma=0.5,
        )

    def _get_sample_house_price_gap_data(
        self, start_date: str, end_date: str
    ) -> pd.DataFrame:
        """Generate sample house price gap data for testing.

        Args:
            start_date: Start date for data collection in 'YYYY-MM-DD' format.
            end_date: End date for data collection in 'YYYY-MM-DD' format.

        Returns:
            DataFrame with columns ``date``, ``country`` and
            ``house_price_gap``, one row per quarter end and country.
        """
        return self._sample_gap_frame(
            start_date,
            end_date,
            value_column="house_price_gap",
            base_gaps={
                "USA": 3.5,
                "EUR": 2.8,
                "GBR": 4.2,
                "CHE": 5.5,
                "JPN": -1.5,
            },
            trend_per_year=0.3,
            crisis_2008=(-4.0, 1.2),
            crisis_2011=(-2.0, 1.0),
            calm_sigma=0.7,
        )