!pip install kagglehub
!pip install tqdm

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

import os
from tqdm import tqdm

import kagglehub

# Fetch the latest published version of the dataset; the returned value is the
# local folder the files were stored in.
path = kagglehub.dataset_download("paultimothymooney/stock-market-data")

print(f"Path to dataset files: {path}")

Path to dataset files: /home/sungu/.cache/kagglehub/datasets/paultimothymooney/stock-market-data/versions/74

# Root folder of the downloaded dataset; each listing below has its own
# sub-folder containing a 'csv' directory with one file per ticker.
BASE_DIR = os.path.join(path, 'stock_market_data')

# Listings to scan (alternative: lists = os.listdir(BASE_DIR)).
lists = ['nasdaq', 'nyse', 'sp500']

# Maps ticker symbol -> full path of its CSV file. When a ticker appears in
# several listings, the listing scanned last wins.
ticker_to_path = {}

for list_name in lists:
    csv_path = os.path.join(BASE_DIR, list_name, 'csv')
    for file_name in os.listdir(csv_path):
        # splitext strips only the final extension, so file names without a
        # dot no longer raise and tickers that contain a dot stay intact.
        ticker, extension = os.path.splitext(file_name)
        if extension.lower() != '.csv':
            # Skip hidden files and anything else that is not a CSV.
            continue
        ticker_to_path[ticker] = os.path.join(csv_path, file_name)

# Maps ticker symbol -> Series of closing prices indexed by trading date.
ticker_to_close_s = {}

# NOTE: the loop variable is deliberately not called `path`; that name holds
# the dataset root returned by kagglehub and must survive this loop so the
# BASE_DIR cell can be re-run.
for ticker, csv_file in tqdm(ticker_to_path.items()):
    single_stock_df = pd.read_csv(csv_file)
    close_s = single_stock_df['Close']
    # Dates are stored as day-month-year; unparseable values become NaT
    # instead of raising.
    close_s.index = pd.to_datetime(single_stock_df['Date'], format='%d-%m-%Y', errors='coerce')
    ticker_to_close_s[ticker] = close_s

100%|██████████████████████████████████████| 2824/2824 [00:22<00:00, 123.89it/s]

# One column per ticker; pandas aligns the individual series on the union of
# their date indexes, leaving NaN where a ticker has no quote.
stock_close_df = pd.DataFrame(ticker_to_close_s)

# Day-over-day relative change of the close price. Gaps are forward-filled
# explicitly and pct_change's implicit padding is switched off: this yields the
# same values as the old default while avoiding the deprecated
# fill_method='pad' behaviour (FutureWarning).
stock_changepct_df = stock_close_df.ffill().pct_change(fill_method=None)

/tmp/ipykernel_43636/3417757284.py:1: FutureWarning: The default fill_method='pad' in DataFrame.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.
  stock_changepct_df = stock_close_df.pct_change()

# Asset weights of my portfolio, keyed by ticker symbol.
# TODO: keep NaNs of recently listed tickers from spreading into the
#       portfolio series.
portfolio = {'AAPL': 0.25, 'AMZN': 0.25, 'MSFT': 0.25, 'NVDA': 0.25}

# Weighted sum of the constituents' daily percentage changes, accumulated one
# ticker at a time on the close-price date index.
portfolio_s = pd.Series(0, index=stock_close_df.index)
for ticker, weight in portfolio.items():
    if ticker not in stock_close_df.columns:
        # Unknown tickers are reported and left out of the sum.
        print(ticker, '를 찾지 못했습니다.')
        continue
    portfolio_s += weight * stock_changepct_df[ticker]

# Store the portfolio as a pseudo-ticker next to the real ones.
stock_changepct_df['MYPORTFOLIO'] = portfolio_s

# First day of the comparison window (day-month-year format).
backtesting_start_day = pd.to_datetime('03-01-2021', format='%d-%m-%Y', errors='coerce')

# Rows = tickers, columns = dates from the start day onwards; tickers with any
# missing value inside the window are removed.
changepct_stock_df = stock_changepct_df.loc[backtesting_start_day:].T.dropna(axis=0)

# Keep only tickers whose change is exactly zero on at most 40% of the days.
zero_change_ratio = (changepct_stock_df == 0).mean(axis=1)
changepct_stock_df = changepct_stock_df.loc[zero_change_ratio <= 0.4, :]

# Number of neighbours returned per query.
num_neighbors = 10

# Exhaustive (brute-force) nearest-neighbour search using cosine distance
# between the per-ticker rows of daily percentage changes.
stock_based_model = NearestNeighbors(
    n_neighbors=num_neighbors,
    algorithm='brute',
    metric='cosine',
)
stock_based_model.fit(changepct_stock_df)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=10)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=10)

# Query the model with the portfolio's own row of daily percentage changes to
# find the tickers that move most like it.
portfolio_vector = changepct_stock_df.loc['MYPORTFOLIO', :].to_numpy().reshape(1, -1)
distances, indices = stock_based_model.kneighbors(portfolio_vector)

# The portfolio row is part of the fitted data, so it shows up among its own
# neighbours. Drop it by name instead of assuming it is always the first hit;
# stock_close_df has no 'MYPORTFOLIO' column to plot.
neighbor_tickers = [
    candidate
    for candidate in changepct_stock_df.index[indices.flatten()]
    if candidate != 'MYPORTFOLIO'
]

plt.figure(figsize=(10, 5))  # create the figure and set its size

for stock_ticker in neighbor_tickers:
    close_since_start = stock_close_df.loc[backtesting_start_day:, stock_ticker]
    # Explicit positional access; Series[0] with a non-integer index is
    # deprecated in pandas.
    start_price = close_since_start.iloc[0]

    # Prices are divided by the first value of the window so that all tickers
    # start at 1 and are comparable.
    plt.plot(close_since_start.index,
             close_since_start / start_price,
             label=stock_ticker)

plt.title('Stock close price')
plt.xlabel('Datetime')
plt.ylabel('Close price')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

/tmp/ipykernel_43636/4107691151.py:4: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  start_price = stock_close_df.loc[backtesting_start_day:, stock_ticker][0]

# Query the model with the NEGATED portfolio row: the tickers closest (in
# cosine distance) to the mirrored vector are the ones that move most opposite
# to the portfolio.
negated_portfolio_vector = -changepct_stock_df.loc['MYPORTFOLIO', :].to_numpy().reshape(1, -1)
distances, indices = stock_based_model.kneighbors(negated_portfolio_vector)

# Unlike the "similar" query, the first hit here is NOT the portfolio itself
# (the portfolio is the farthest possible row from its own negation), so every
# returned neighbour is kept. The name filter only guards against plotting the
# pseudo-ticker, which has no column in stock_close_df.
neighbor_tickers = [
    candidate
    for candidate in changepct_stock_df.index[indices.flatten()]
    if candidate != 'MYPORTFOLIO'
]

plt.figure(figsize=(10, 5))  # create the figure and set its size

for stock_ticker in neighbor_tickers:
    close_since_start = stock_close_df.loc[backtesting_start_day:, stock_ticker]
    # Explicit positional access; Series[0] with a non-integer index is
    # deprecated in pandas.
    start_price = close_since_start.iloc[0]

    # Prices are divided by the first value of the window so that all tickers
    # start at 1 and are comparable.
    plt.plot(close_since_start.index,
             close_since_start / start_price,
             label=stock_ticker)

plt.title('Stock close price')
plt.xlabel('Datetime')
plt.ylabel('Close price')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

/tmp/ipykernel_43636/4107691151.py:4: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  start_price = stock_close_df.loc[backtesting_start_day:, stock_ticker][0]

데이터 전처리

로직

내 포트폴리오와 비슷하게 움직이는 주식

내 포트폴리오와 반대로 움직이는 주식