#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 为AU2512期货数据添加买挂合计和卖挂合计列 这个脚本读取原始数据文件,计算买方和卖方的挂单总量, 并添加两列:买挂合计(买1量至买5量之和)和卖挂合计(卖1量至卖5量之和) 使用方法: python add_order_sum_columns.py """ import pandas as pd import numpy as np import os from datetime import datetime def add_order_sum_columns(): """添加买挂合计和卖挂合计列到数据文件""" # 文件路径 input_file = 'data/au2512_20251013.parquet' backup_file = 'data/au2512_20251013_order_backup.parquet' output_file = 'data/au2512_20251013.parquet' print("开始为AU2512期货数据添加买挂合计和卖挂合计列...") # 检查输入文件是否存在 if not os.path.exists(input_file): print(f"错误: 输入文件不存在 - {input_file}") return False try: # 读取数据 print(f"正在读取数据文件: {input_file}") df = pd.read_parquet(input_file) print(f"数据加载成功: {len(df):,} 条记录") # 显示原始列名 print(f"\n原始列名 ({len(df.columns)} 列):") for i, col in enumerate(df.columns): print(f" {i:2d}: {col}") # 找到买量和卖量的列 buy_volume_cols = [] sell_volume_cols = [] for col in df.columns: if '买' in col and '量' in col: buy_volume_cols.append(col) elif '卖' in col and '量' in col: sell_volume_cols.append(col) print(f"\n找到的买量列: {buy_volume_cols}") print(f"找到的卖量列: {sell_volume_cols}") if len(buy_volume_cols) < 5 or len(sell_volume_cols) < 5: print("警告: 找到的买量或卖量列不足5个") # 计算买挂合计(买1量至买5量之和) print(f"\n计算买挂合计...") df['买挂合计'] = df[buy_volume_cols[:5]].sum(axis=1) # 计算卖挂合计(卖1量至卖5量之和) print(f"计算卖挂合计...") df['卖挂合计'] = df[sell_volume_cols[:5]].sum(axis=1) # 显示添加的列信息 print(f"\n添加的列信息:") print(f"买挂合计: 最小值 {df['买挂合计'].min()}, 最大值 {df['买挂合计'].max()}, 平均值 {df['买挂合计'].mean():.1f}") print(f"卖挂合计: 最小值 {df['卖挂合计'].min()}, 最大值 {df['卖挂合计'].max()}, 平均值 {df['卖挂合计'].mean():.1f}") # 备份原始文件 if not os.path.exists(backup_file): print(f"\n备份原始文件到: {backup_file}") df_original = pd.read_parquet(input_file) df_original.to_parquet(backup_file, index=False) print("原始文件备份完成") else: print(f"\n备份文件已存在: {backup_file},跳过备份") # 保存增强后的数据 print(f"\n保存增强后的数据到: {output_file}") df.to_parquet(output_file, index=False) # 显示最终列信息 print(f"\n最终数据列信息 ({len(df.columns)} 列):") for i, col in enumerate(df.columns): print(f" {i:2d}: {col}") print(f"\n数据增强完成!") print(f"新增列:") print(f" - 买挂合计: 买1量至买5量之和") print(f" - 卖挂合计: 卖1量至卖5量之和") # 显示一些统计信息 print(f"\n统计信息:") print(f"买挂合计 > 0 的记录: {(df['买挂合计'] > 0).sum():,} ({(df['买挂合计'] > 0).mean()*100:.1f}%)") print(f"卖挂合计 > 0 的记录: {(df['卖挂合计'] > 0).sum():,} ({(df['卖挂合计'] > 0).mean()*100:.1f}%)") print(f"买挂合计最大值: {df['买挂合计'].max():,} 手") print(f"卖挂合计最大值: {df['卖挂合计'].max():,} 手") print(f"买挂合计平均值: {df['买挂合计'].mean():.1f} 手") print(f"卖挂合计平均值: {df['卖挂合计'].mean():.1f} 手") return True except Exception as e: print(f"处理过程中发生错误: {e}") return False def verify_columns(): """验证添加的列是否正确""" print("\n" + "="*60) print("验证添加的列") print("="*60) try: # 读取增强后的数据 df = pd.read_parquet('data/au2512_20251013.parquet') if '买挂合计' in df.columns and '卖挂合计' in df.columns: print("[OK] 买挂合计和卖挂合计列已成功添加") # 显示前5行的相关列 buy_cols = [col for col in df.columns if '买' in col and '量' in col][:5] sell_cols = [col for col in df.columns if '卖' in col and '量' in col][:5] print(f"\n前5行数据验证:") print("买量列 + 买挂合计:") for i in range(min(5, len(df))): buy_sum = df[buy_cols].iloc[i].sum() buy_total = df['买挂合计'].iloc[i] match = "[OK]" if abs(buy_sum - buy_total) < 0.001 else "[ERROR]" print(f" 行{i+1}: 买量列求和={buy_sum}, 买挂合计={buy_total} {match}") print("卖量列 + 卖挂合计:") for i in range(min(5, len(df))): sell_sum = df[sell_cols].iloc[i].sum() sell_total = df['卖挂合计'].iloc[i] match = "[OK]" if abs(sell_sum - sell_total) < 0.001 else "[ERROR]" print(f" 行{i+1}: 卖量列求和={sell_sum}, 卖挂合计={sell_total} {match}") else: print("[ERROR] 买挂合计或卖挂合计列未找到") except Exception as e: print(f"验证过程中发生错误: {e}") if __name__ == "__main__": success = add_order_sum_columns() if success: verify_columns() print(f"\n[OK] 数据增强完成!") else: print(f"\n[ERROR] 数据增强失败!")