# coding:utf-8
"""Daily stock (K-line) data collector backed by the Xueqiu API.

Fetches day-level quotes for every stock code registered in the local
database, persists them to MySQL (table ``gp_day_data``), and detects /
repairs ex-rights price discontinuities by re-downloading the full
history of affected stocks.
"""
import gc
from datetime import datetime, timedelta

import pandas as pd
import requests
from sqlalchemy import create_engine, text
from tqdm import tqdm

from src.scripts.config import XUEQIU_HEADERS


class StockDailyDataCollector:
    """Collector for daily stock K-line data."""

    # Tables holding the stock codes to collect (all A-share, index, HK).
    _CODE_TABLES = ('gp_code_all', 'gp_code_zs', 'gp_code_hk')

    def __init__(self, db_url):
        """Create a collector bound to the MySQL database at *db_url*.

        A small connection pool is used; connections are recycled hourly
        to avoid the server dropping idle connections.
        """
        self.engine = create_engine(
            db_url,
            pool_size=5,
            max_overflow=10,
            pool_recycle=3600
        )
        self.headers = XUEQIU_HEADERS

    def fetch_all_stock_codes(self):
        """Return the deduplicated union of stock codes from all code tables."""
        codes_by_table = {}
        for table in self._CODE_TABLES:
            df = pd.read_sql(f"SELECT gp_code FROM {table}", self.engine)
            codes_by_table[table] = df['gp_code'].tolist()

        codes_all = codes_by_table['gp_code_all']
        codes_zs = codes_by_table['gp_code_zs']
        codes_hk = codes_by_table['gp_code_hk']
        all_codes = list(set(codes_all + codes_zs + codes_hk))
        print(f"获取到股票代码: {len(codes_all)}个来自gp_code_all, {len(codes_zs)}个来自gp_code_zs, {len(codes_hk)}个来自gp_code_hk, 去重后共{len(all_codes)}个")
        return all_codes

    def fetch_daily_stock_data(self, symbol, begin, count=-1):
        """Fetch day-level K-line data for *symbol* from the Xueqiu API.

        count=-1 requests the latest day, -2 the latest two days,
        -1800 the latest 1800 days.  Returns the decoded JSON payload;
        on any request/HTTP failure a synthetic ``{'error_code': -1,
        'error_description': ...}`` dict is returned so callers can
        handle all failures uniformly.
        """
        url = (
            f"https://stock.xueqiu.com/v5/stock/chart/kline.json?symbol={symbol}"
            f"&begin={begin}&period=day&type=before&count={count}"
            f"&indicator=kline,pe,pb,ps,pcf,market_capital,agt,ggt,balance"
        )
        try:
            response = requests.get(url, headers=self.headers, timeout=20)
            # Fail fast on HTTP error statuses instead of trying to
            # JSON-decode an error page; converted to an error dict below.
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Request error for {symbol}: {e}")
            return {'error_code': -1, 'error_description': str(e)}

    def transform_data(self, data, symbol):
        """Convert an API payload into a DataFrame ready for persistence.

        Returns None when the payload lacks the expected structure.
        Timestamps are converted from epoch milliseconds (UTC) to
        Asia/Shanghai local time.
        """
        try:
            items = data['data']['item']
            columns = data['data']['column']
        except KeyError as e:
            print(f"KeyError for {symbol}: {e}")
            return None
        df = pd.DataFrame(items, columns=columns)
        df['symbol'] = symbol
        required_columns = ['timestamp', 'volume', 'open', 'high', 'low',
                            'close', 'chg', 'percent', 'turnoverrate',
                            'amount', 'symbol', 'pb', 'pe', 'ps']
        # Keep only the columns the target table knows about; the API may
        # omit some indicators depending on the market.
        existing_columns = [col for col in required_columns if col in df.columns]
        df = df[existing_columns]
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(
                df['timestamp'], unit='ms', utc=True
            ).dt.tz_convert('Asia/Shanghai')
        return df

    def save_batch_to_database(self, batch):
        """Append a list of DataFrames to gp_day_data in one concat/insert."""
        if batch:
            df_all = pd.concat(batch, ignore_index=True)
            df_all.to_sql('gp_day_data', self.engine, if_exists='append', index=False)

    def fetch_data_for_date(self, date=None):
        """Collect one day's data for every stock and store it.

        *date* is 'YYYY-MM-DD'; defaults to today.  Existing rows for
        that date are deleted first so the run is idempotent.
        """
        if date is None:
            start_date = datetime.now()
            date_str = start_date.strftime('%Y-%m-%d')
        else:
            start_date = datetime.strptime(date, '%Y-%m-%d')
            date_str = date

        # Idempotency: clear any rows already stored for the target date.
        delete_query = text("DELETE FROM gp_day_data WHERE `timestamp` LIKE :date_str")
        with self.engine.begin() as conn:
            conn.execute(delete_query, {"date_str": f"{date_str}%"})

        stock_codes = self.fetch_all_stock_codes()
        # Midnight of the target day, in epoch milliseconds, as the API's
        # 'begin' cursor.
        begin = int(start_date.replace(hour=0, minute=0, second=0, microsecond=0).timestamp() * 1000)

        batch_data = []
        for symbol in tqdm(stock_codes, desc=f"Fetching and saving daily stock data for {date_str}"):
            data = self.fetch_daily_stock_data(symbol, begin)
            if data.get('error_code') == 0:
                df = self.transform_data(data, symbol)
                if df is not None:
                    batch_data.append(df)
            else:
                print(f"Error fetching data for {symbol} on {date_str}: {data.get('error_description')}")

            # Flush in batches of 100 to bound memory usage.
            if len(batch_data) >= 100:
                self.save_batch_to_database(batch_data)
                batch_data.clear()
                gc.collect()

        # Save remaining data
        if batch_data:
            self.save_batch_to_database(batch_data)
            gc.collect()

        self.engine.dispose()
        print(f"Daily data fetching and saving completed for {date_str}.")

    def delete_stock_history(self, symbol):
        """Delete all stored history for *symbol*; return True on success."""
        delete_query = text("DELETE FROM gp_day_data WHERE symbol = :symbol")
        try:
            with self.engine.begin() as conn:
                conn.execute(delete_query, {"symbol": symbol})
            print(f"Deleted history for {symbol}")
            return True
        except Exception as e:
            print(f"Error deleting history for {symbol}: {e}")
            return False

    def refetch_and_save_history(self, symbol, days=1800):
        """Refetch and persist the last *days* days of history for *symbol*."""
        print(f"Refetching last {days} days for {symbol}...")
        begin = int(datetime.now().timestamp() * 1000)
        data = self.fetch_daily_stock_data(symbol, begin, count=-days)
        if data.get('error_code') == 0:
            df = self.transform_data(data, symbol)
            if df is not None and not df.empty:
                self.save_batch_to_database([df])
                print(f"Successfully refetched and saved history for {symbol}.")
            else:
                print(f"No data transformed for {symbol} after refetch.")
        else:
            print(f"Error refetching history for {symbol}: {data.get('error_description')}")

    def _fetch_api_prev_close(self, symbol):
        """Return (date_str, close) for *symbol*'s previous trading day.

        Queries the API for the latest two days and parses the
        second-to-last item (items arrive in ascending time order).
        Returns (None, None) when the data is unavailable, too short,
        or cannot be parsed.
        """
        begin = int(datetime.now().timestamp() * 1000)
        data = self.fetch_daily_stock_data(symbol, begin, count=-2)
        if not (data.get('error_code') == 0
                and data.get('data', {}).get('item')
                and len(data['data']['item']) >= 2):
            # API failure or not enough history to compare - skip stock.
            return None, None
        try:
            # [-2] is the previous trading day in the ascending series.
            prev_day_data = data['data']['item'][-2]
            columns = data['data']['column']
            api_timestamp_ms = prev_day_data[columns.index('timestamp')]
            api_close = prev_day_data[columns.index('close')]
            # Epoch ms -> 'YYYY-MM-DD' in exchange-local time, used as the
            # database lookup key.
            api_timestamp_str = pd.to_datetime(
                api_timestamp_ms, unit='ms', utc=True
            ).tz_convert('Asia/Shanghai').strftime('%Y-%m-%d')
            return api_timestamp_str, api_close
        except (ValueError, IndexError, TypeError) as e:
            print(f"\nError parsing API data for {symbol}: {e}")
            return None, None

    def _fetch_db_close(self, symbol, date_str):
        """Return the stored close for *symbol* on *date_str*, or None."""
        query = text("SELECT `close` FROM gp_day_data WHERE symbol = :symbol AND `timestamp` LIKE :date_str")
        try:
            with self.engine.connect() as conn:
                result = conn.execute(
                    query, {"symbol": symbol, "date_str": f"{date_str}%"}
                ).fetchone()
            return result[0] if result else None
        except Exception as e:
            print(f"\nError getting DB close for {symbol} on {date_str}: {e}")
            return None

    def check_and_fix_ex_rights_data(self):
        """Detect ex-rights events and repair affected stock histories.

        For every stock, the previous trading day's close reported by
        the API is compared with the value stored in the database
        (looked up by the API-returned timestamp, which is the most
        robust join key).  A mismatch means the stock went ex-rights:
        the event is logged to gp_ex_rights_log, the stock's history is
        deleted, and the last 1800 days are refetched.
        """
        all_codes = self.fetch_all_stock_codes()
        ex_rights_log_data = []

        print("--- Step 1: Checking for ex-rights stocks ---")
        for symbol in tqdm(all_codes, desc="Comparing prices"):
            # 1. Previous trading day's date and close, per the API.
            api_timestamp_str, api_close = self._fetch_api_prev_close(symbol)
            if api_timestamp_str is None or api_close is None:
                continue

            # 2. Close stored in the database for that same day.
            db_close = self._fetch_db_close(symbol, api_timestamp_str)

            # 3. Compare.  No DB row for that day means a new listing or
            # an earlier collection failure, not an ex-rights event - skip.
            if db_close is not None:
                # db_close may come back as Decimal; normalise before comparing.
                if not abs(float(db_close) - api_close) < 0.001:
                    print(f"\nEx-rights detected for {symbol} on {api_timestamp_str}: DB_close={db_close}, API_close={api_close}")
                    ex_rights_log_data.append({
                        'symbol': symbol,
                        'date': datetime.now().strftime('%Y-%m-%d'),
                        'db_price': float(db_close),
                        'api_price': api_close,
                        'log_time': datetime.now()
                    })

        # 4. Log and repair the affected stocks.
        if not ex_rights_log_data:
            print("\n--- No ex-rights stocks found. Data is consistent. ---")
            self.engine.dispose()
            return

        # Persist the log before mutating any price data.
        self.save_ex_rights_log(ex_rights_log_data)

        ex_rights_stocks = [item['symbol'] for item in ex_rights_log_data]
        print(f"\n--- Step 2: Found {len(ex_rights_stocks)} stocks to fix: {ex_rights_stocks} ---")
        for symbol in tqdm(ex_rights_stocks, desc="Fixing data"):
            if self.delete_stock_history(symbol):
                self.refetch_and_save_history(symbol, days=1800)

        self.engine.dispose()
        print("\n--- Ex-rights data fixing process completed. ---")

    def save_ex_rights_log(self, log_data: list):
        """Persist ex-rights events to the gp_ex_rights_log table."""
        if not log_data:
            return
        print(f"--- Saving {len(log_data)} ex-rights events to log table... ---")
        try:
            df = pd.DataFrame(log_data)
            # Rename to match the database schema.
            df = df.rename(columns={
                'symbol': 'stock_code',
                'date': 'change_date',
                'db_price': 'before_price',
                'api_price': 'after_price',
                'log_time': 'update_time'
            })
            df.to_sql('gp_ex_rights_log', self.engine, if_exists='append', index=False)
            print("--- Ex-rights log saved successfully. ---")
        except Exception as e:
            print(f"!!! Error saving ex-rights log: {e}")


def collect_stock_daily_data(db_url, date=None):
    """Run the daily collection, then the ex-rights consistency check."""
    collector = StockDailyDataCollector(db_url)
    collector.fetch_data_for_date(date)
    collector.check_and_fix_ex_rights_data()


if __name__ == "__main__":
    # NOTE(review): credentials are hard-coded; prefer loading the DB URL
    # from the environment or a config file.
    db_url = 'mysql+pymysql://root:Chlry#$.8@192.168.18.199:3306/db_gp_cj'

    # --- Usage ---
    # 1. Daily collection for today:
    # collect_stock_daily_data(db_url)

    # 2. Manually run the ex-rights check and data repair:
    # collector = StockDailyDataCollector(db_url)
    # collector.check_and_fix_ex_rights_data()