# coding:utf-8
|
|
|
|
|
|
|
|
import requests
|
|
|
|
import pandas as pd
|
|
|
|
from sqlalchemy import create_engine, text
|
|
|
|
from datetime import datetime
|
|
|
|
from tqdm import tqdm
|
|
|
|
from src.scripts.config import XUEQIU_HEADERS
|
|
|
|
import gc
|
|
|
|
|
|
|
|
class StockDailyDataCollector:
    """Collects daily k-line (OHLCV) data for stocks from the Xueqiu API and stores it in MySQL."""

    def __init__(self, db_url):
        """Create the collector with a pooled SQLAlchemy engine.

        Args:
            db_url: SQLAlchemy database URL of the target MySQL instance.
        """
        # pool_recycle=3600 refreshes connections hourly so MySQL's
        # wait_timeout does not drop idle connections mid-run.
        self.engine = create_engine(
            db_url,
            pool_size=5,
            max_overflow=10,
            pool_recycle=3600,
        )
        self.headers = XUEQIU_HEADERS

    def fetch_all_stock_codes(self):
        """Return the deduplicated union of stock codes from gp_code_all and gp_code_zs."""
        # Codes from gp_code_all
        query_all = "SELECT gp_code FROM gp_code_all"
        df_all = pd.read_sql(query_all, self.engine)
        codes_all = df_all['gp_code'].tolist()

        # Codes from gp_code_zs — NOTE(review): presumably index codes,
        # inferred from the table name; confirm against the schema.
        query_zs = "SELECT gp_code FROM gp_code_zs"
        df_zs = pd.read_sql(query_zs, self.engine)
        codes_zs = df_zs['gp_code'].tolist()

        # Merge and deduplicate; set() makes the resulting order arbitrary.
        all_codes = list(set(codes_all + codes_zs))
        print(f"获取到股票代码: {len(codes_all)}个来自gp_code_all, {len(codes_zs)}个来自gp_code_zs, 去重后共{len(all_codes)}个")
        return all_codes

    def fetch_daily_stock_data(self, symbol, begin):
        """Fetch one day's k-line payload for *symbol* from Xueqiu.

        Args:
            symbol: stock code, e.g. 'SH600000'.
            begin: epoch timestamp in milliseconds marking the day to fetch.

        Returns:
            The parsed JSON dict on success; on any request/HTTP error, a
            dict with error_code=-1 and error_description, mirroring the
            API's own error envelope so callers handle both uniformly.
        """
        url = f"https://stock.xueqiu.com/v5/stock/chart/kline.json?symbol={symbol}&begin={begin}&period=day&type=before&count=-1&indicator=kline,pe,pb,ps,pcf,market_capital,agt,ggt,balance"
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            # Surface non-2xx responses as errors instead of trying to parse
            # an HTML error page as JSON.
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Request error for {symbol}: {e}")
            return {'error_code': -1, 'error_description': str(e)}

    def transform_data(self, data, symbol):
        """Convert an API payload into a DataFrame shaped for gp_day_data.

        Args:
            data: JSON dict as returned by fetch_daily_stock_data.
            symbol: stock code to tag every row with.

        Returns:
            A DataFrame restricted to the table's columns, or None when the
            payload lacks the expected structure.
        """
        try:
            items = data['data']['item']
            columns = data['data']['column']
        except (KeyError, TypeError) as e:
            # KeyError: 'data'/'item'/'column' missing. TypeError: the API
            # returned "data": null in an error payload. Either way there is
            # nothing to save for this symbol.
            print(f"Malformed payload for {symbol}: {e}")
            return None

        df = pd.DataFrame(items, columns=columns)
        df['symbol'] = symbol

        required_columns = ['timestamp', 'volume', 'open', 'high', 'low', 'close',
                            'chg', 'percent', 'turnoverrate', 'amount', 'symbol', 'pb', 'pe', 'ps']
        # Keep only the columns the target table stores; tolerate columns the
        # API did not return for this symbol.
        existing_columns = [col for col in required_columns if col in df.columns]
        df = df[existing_columns]

        if 'timestamp' in df.columns:
            # API timestamps are epoch milliseconds (UTC); store as
            # Asia/Shanghai local time.
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True).dt.tz_convert('Asia/Shanghai')

        return df

    def save_batch_to_database(self, batch):
        """Append the accumulated DataFrames in *batch* to gp_day_data.

        Args:
            batch: list of DataFrames from transform_data; an empty list is
                a no-op.
        """
        if batch:
            df_all = pd.concat(batch, ignore_index=True)
            df_all.to_sql('gp_day_data', self.engine, if_exists='append', index=False)

    def fetch_data_for_date(self, date=None):
        """Fetch and persist one day of data for every known stock code.

        Rows already stored for the target date are deleted first, making the
        method idempotent for a given day. Disposes the engine when done.

        Args:
            date: 'YYYY-MM-DD' string; defaults to today.
        """
        if date is None:
            start_date = datetime.now()
            date_str = start_date.strftime('%Y-%m-%d')
        else:
            start_date = datetime.strptime(date, '%Y-%m-%d')
            date_str = date

        # Delete any rows already stored for this date so a re-run does not
        # duplicate them (MySQL compares the datetime as a string prefix).
        delete_query = text("DELETE FROM gp_day_data WHERE `timestamp` LIKE :date_str")
        with self.engine.begin() as conn:
            conn.execute(delete_query, {"date_str": f"{date_str}%"})

        stock_codes = self.fetch_all_stock_codes()
        # Midnight of the target day, as epoch milliseconds for the API.
        begin = int(start_date.replace(hour=0, minute=0, second=0, microsecond=0).timestamp() * 1000)

        batch_data = []
        for symbol in tqdm(stock_codes, desc=f"Fetching and saving daily stock data for {date_str}"):
            data = self.fetch_daily_stock_data(symbol, begin)
            if data.get('error_code') == 0:
                df = self.transform_data(data, symbol)
                if df is not None:
                    batch_data.append(df)
            else:
                print(f"Error fetching data for {symbol} on {date_str}: {data.get('error_description')}")

            # Flush every 100 symbols to bound memory usage on long runs.
            if len(batch_data) >= 100:
                self.save_batch_to_database(batch_data)
                batch_data.clear()
                gc.collect()

        # Save remaining data
        if batch_data:
            self.save_batch_to_database(batch_data)
            gc.collect()

        self.engine.dispose()
        print(f"Daily data fetching and saving completed for {date_str}.")
|
|
|
|
|
|
|
|
def collect_stock_daily_data(db_url, date=None):
    """Convenience entry point: run one day's collection against *db_url*.

    Args:
        db_url: SQLAlchemy database URL of the target MySQL instance.
        date: optional 'YYYY-MM-DD' day to collect; defaults to today.
    """
    StockDailyDataCollector(db_url).fetch_data_for_date(date)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # SECURITY(review): database credentials are hard-coded in source; move
    # them to an environment variable or a config file kept out of version
    # control.
    db_url = 'mysql+pymysql://root:Chlry#$.8@192.168.18.199:3306/db_gp_cj'
    collect_stock_daily_data(db_url)
|