In [ ]:
!pip install kagglehub
!pip install tqdm
In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

import os
from tqdm import tqdm

데이터 전처리¶

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("paultimothymooney/stock-market-data")

print("Path to dataset files:", path)
Path to dataset files: /home/sungu/.cache/kagglehub/datasets/paultimothymooney/stock-market-data/versions/74
In [3]:
BASE_DIR = os.path.join(path, 'stock_market_data')

#lists = os.listdir(BASE_DIR)
lists = ['nasdaq', 'nyse', 'sp500']
ticker_to_path = {}

for list_name in lists:
    csv_path = os.path.join(BASE_DIR, list_name, 'csv')
    for file_name in os.listdir(csv_path):
        ticker = file_name[:file_name.index('.')]
        ticker_to_path[ticker] = os.path.join(csv_path, file_name)
In [4]:
ticker_to_close_s = {}

for ticker, path in tqdm(ticker_to_path.items()):
    single_stock_df = pd.read_csv(path)
    close_s = single_stock_df['Close']
    close_s.index = pd.to_datetime(single_stock_df['Date'], format='%d-%m-%Y', errors='coerce')
    ticker_to_close_s[ticker] = close_s
100%|██████████████████████████████████████| 2824/2824 [00:22<00:00, 123.89it/s]
In [5]:
stock_close_df = pd.DataFrame(ticker_to_close_s)

로직¶

In [6]:
stock_changepct_df = stock_close_df.pct_change()
/tmp/ipykernel_43636/3417757284.py:1: FutureWarning: The default fill_method='pad' in DataFrame.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.
  stock_changepct_df = stock_close_df.pct_change()
In [7]:
# 내 포트폴리오의 자산 비율 입력
# @TODO: 최근 등록 종목의 NaN전염 방지
portfolio = {
    'AAPL': 0.25,
    'AMZN': 0.25,
    'MSFT': 0.25,
    'NVDA': 0.25
}

portfolio_s = pd.Series(0, index=stock_close_df.index)
for ticker, weight in portfolio.items():
    if ticker in stock_close_df.columns:
        portfolio_s += weight * stock_changepct_df[ticker]
    else:
        print(ticker, '를 찾지 못했습니다.')

stock_changepct_df['MYPORTFOLIO'] = portfolio_s
In [8]:
backtesting_start_day = pd.to_datetime('03-01-2021', format='%d-%m-%Y', errors='coerce')
changepct_stock_df = stock_changepct_df.loc[backtesting_start_day:].T
changepct_stock_df.dropna(axis=0, inplace=True)
changepct_stock_df = changepct_stock_df.loc[(changepct_stock_df == 0).mean(axis=1) <= 0.4, :]

num_neighbors = 10
stock_based_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=num_neighbors)
stock_based_model.fit(changepct_stock_df)
Out[8]:
NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=10)

내 포트폴리오와 비슷하게 움직이는 주식¶

In [9]:
distances, indices = stock_based_model.kneighbors(changepct_stock_df.loc['MYPORTFOLIO',:].to_numpy().reshape(1, -1))
In [10]:
plt.figure(figsize=(10, 5)) # 그래프 창 생성 및 크기 지정

for stock_ticker in changepct_stock_df.index[indices.flatten()[1:]]:
    start_price = stock_close_df.loc[backtesting_start_day:, stock_ticker][0]
    
    plt.plot(stock_close_df[backtesting_start_day:].index,
             stock_close_df.loc[backtesting_start_day:, stock_ticker]/start_price,
             label=stock_ticker)

plt.title('Stock close price')
plt.xlabel('Datetime')
plt.ylabel('Close price')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
/tmp/ipykernel_43636/4107691151.py:4: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  start_price = stock_close_df.loc[backtesting_start_day:, stock_ticker][0]
No description has been provided for this image

내 포트폴리오가 빅테크 4개를 0.25씩 보유한 포트폴리오다보니 나스닥 추종 ETF인 ONEQ같은것이 가장 연관이 높다고 나오는걸 볼 수 있다.

이를 통해 빅테크기업의 가격이 나스닥 지수에 큰 영향을 미친다는 추론도 가능할 것 같다.

내 포트폴리오와 반대로 움직이는 주식¶

In [11]:
distances, indices = stock_based_model.kneighbors(-changepct_stock_df.loc['MYPORTFOLIO',:].to_numpy().reshape(1, -1))
In [12]:
plt.figure(figsize=(10, 5)) # 그래프 창 생성 및 크기 지정

for stock_ticker in changepct_stock_df.index[indices.flatten()[1:]]:
    start_price = stock_close_df.loc[backtesting_start_day:, stock_ticker][0]
    
    plt.plot(stock_close_df[backtesting_start_day:].index,
             stock_close_df.loc[backtesting_start_day:, stock_ticker]/start_price,
             label=stock_ticker)

plt.title('Stock close price')
plt.xlabel('Datetime')
plt.ylabel('Close price')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
/tmp/ipykernel_43636/4107691151.py:4: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  start_price = stock_close_df.loc[backtesting_start_day:, stock_ticker][0]
No description has been provided for this image

내 주식포트폴리오와 반대로 주식들로는 움직이는 KINS, UEEC 같은 보험주가 나왔다.

하락장에서는 보통 보험주들이 잘 버텨준다고 한다. 이런 보험주를 포트폴리오에 포함한다면 헷징 효과가 있을것으로 에상된다.