"""Study how the trade price moves after large-volume ticks.

Ticks whose per-tick volume exceeds a threshold are treated as signals.
Signals that follow a kept signal too closely are suppressed, and the price
path after each remaining signal is plotted and summarised.
"""

import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Configure fonts that can render the Chinese labels used in the charts.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# (0-based offset after the signal, display name) pairs reported as key points.
_KEY_POINTS = ((9, '第10点'), (49, '第50点'), (199, '第200点'), (499, '第500点'))


def _find_volume_column(df):
    """Return the per-tick volume column of *df*, or None when nothing matches.

    Columns whose name marks them as cumulative volume are skipped. If no
    column matches the primary patterns, a looser fallback search is made
    that also skips buy/sell quantity columns.
    """
    for col in df.columns:
        name = str(col)
        matches = '当前成交量' in name or 'cur_volume' in name.lower() or '成交量' in name
        if matches and '累积' not in name:
            return col

    print("未找到当前成交量列,尝试查找其他可能的成交量列...")
    for col in df.columns:
        name = str(col)
        if '量' in name and not any(tag in name for tag in ('累积', '买', '卖')):
            print(f"使用可能的成交量列: {col}")
            return col
    return None


def _find_price_column(df):
    """Return the first column of *df* that looks like a trade price, or None."""
    for col in df.columns:
        name = str(col)
        if '成交价' in name or 'price' in name.lower():
            return col
    return None


def _apply_signal_suppression(indices, suppression_window=20):
    """Drop signals that fall within *suppression_window* of a kept signal.

    Args:
        indices: Iterable of integer row positions of the raw signals.
        suppression_window: A signal is dropped when its distance to the
            previously kept signal is less than or equal to this value.

    Returns:
        A ``(kept, suppressed_count)`` tuple. ``kept`` is sorted ascending.
        Empty input yields ``([], 0)``.
    """
    kept = []
    suppressed_count = 0
    for idx in sorted(indices):
        # The input is sorted, so the last kept signal is always the nearest
        # one; comparing against it alone is equivalent to scanning them all.
        if kept and idx - kept[-1] <= suppression_window:
            suppressed_count += 1
        else:
            kept.append(idx)
    return kept, suppressed_count


def _extract_price_sequences(prices, volumes, positions, max_points):
    """Collect the price path that follows each signal, relative to its price.

    Args:
        prices: 1-D array of trade prices in row order.
        volumes: 1-D array of per-tick volumes in row order.
        positions: Row positions (not index labels) of the signals.
        max_points: Maximum number of rows to take after each signal.

    Returns:
        ``(sequences, sequence_info)``. ``sequences[i]`` holds the price
        differences against the signal row; ``sequence_info[i]`` describes
        that signal. Signals on the last row are skipped because no rows
        follow them.
    """
    sequences = []
    sequence_info = []
    last_position = len(prices) - 1
    for pos in positions:
        take_points = min(max_points, last_position - pos)
        if take_points <= 0:
            continue
        base_price = prices[pos]
        sequences.append(prices[pos + 1: pos + 1 + take_points] - base_price)
        sequence_info.append({
            'start_index': pos,
            'volume': volumes[pos],
            'base_price': base_price,
            'sequence_length': take_points,
        })
    return sequences, sequence_info


def _calculate_avg_changes_and_std(sequences):
    """Return per-offset mean and standard deviation across *sequences*.

    Sequences may differ in length; each offset is averaged over the
    sequences long enough to reach it.
    """
    if not sequences:
        return [], []
    avg_changes = []
    std_changes = []
    for offset in range(max(len(seq) for seq in sequences)):
        point_changes = [seq[offset] for seq in sequences if len(seq) > offset]
        if point_changes:
            avg_changes.append(np.mean(point_changes))
            std_changes.append(np.std(point_changes))
    return avg_changes, std_changes


def _calculate_stats(sequences):
    """Summarise final and extreme relative changes; ``{}`` if nothing to use."""
    usable = [seq for seq in sequences if len(seq) > 0]
    if not usable:
        return {}
    final_changes = [seq[-1] for seq in usable]
    return {
        'count': len(sequences),
        'avg_final_change': np.mean(final_changes),
        'std_final_change': np.std(final_changes),
        'max_rise': np.max(final_changes),
        'max_fall': np.min(final_changes),
        'positive_ratio': sum(1 for change in final_changes if change > 0) / len(final_changes),
        'avg_max_gain': np.mean([np.max(seq) for seq in usable]),
        'avg_max_loss': np.mean([np.min(seq) for seq in usable]),
    }


def _reached_key_points(avg_changes, std_changes):
    """Return ``(name, mean, std)`` for every key point the data reaches.

    The averaged path is never longer than the analysis horizon, so
    filtering on its length also picks the right key points per horizon.
    """
    return [
        (name, avg_changes[offset], std_changes[offset])
        for offset, name in _KEY_POINTS
        if offset < len(avg_changes)
    ]


def _build_stats_text(length, volume_threshold, summary, volume_stats,
                      avg_changes, std_changes, volume_info):
    """Build the multi-line statistics text shown in the fourth panel."""
    parts = [
        f"=== 当前成交量>{volume_threshold} 统计信息 (后{length}点) ===\n",
        "信号抑制效果:\n",
        f" 原始信号: {summary['raw']}个\n",
        f" 过滤信号: {summary['kept']}个\n",
        f" 抑制数量: {summary['suppressed']}个\n",
        f" 抑制率: {summary['rate']:.1f}%\n\n",
    ]
    if volume_stats:
        parts += [
            "价格统计:\n",
            f" 序列数量: {volume_stats['count']}\n",
            f" 平均最终变化: {volume_stats['avg_final_change']:.4f}\n",
            f" 变化标准差: {volume_stats['std_final_change']:.4f}\n",
            f" 最大上涨: {volume_stats['max_rise']:.4f}\n",
            f" 最大下跌: {volume_stats['max_fall']:.4f}\n",
            f" 上涨比例: {volume_stats['positive_ratio']:.1%}\n",
            f" 平均最大获利: {volume_stats['avg_max_gain']:.4f}\n",
            f" 平均最大亏损: {volume_stats['avg_max_loss']:.4f}\n\n",
        ]
    if avg_changes:
        parts.append("=== 关键时间点分析 ===\n")
        parts += [
            f" {name}: {avg:.4f} (±{std:.4f})\n"
            for name, avg, std in _reached_key_points(avg_changes, std_changes)
        ]
    if volume_info:
        volumes = [info['volume'] for info in volume_info]
        parts += [
            "\n=== 成交量信息 ===\n",
            f" 成交量范围: {min(volumes):.0f} - {max(volumes):.0f}\n",
            f" 平均成交量: {np.mean(volumes):.0f}\n",
            f" 成交量中位数: {np.median(volumes):.0f}",
        ]
    return "".join(parts)


def _draw_gradient_sequences(ax, sequences, linewidth):
    """Plot every sequence on *ax* in a viridis gradient, then a zero line."""
    if sequences:
        colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(sequences)))
        for color, sequence in zip(colors, sequences):
            ax.plot(range(len(sequence)), sequence, color=color, alpha=0.6,
                    linewidth=linewidth)
    ax.axhline(y=0, color='red', linestyle='--', alpha=0.7, linewidth=1.5)


def _draw_volume_groups(ax, sequences, volume_info, length):
    """Plot the top 20% of signals by volume in red and the rest in blue."""
    top_sequences = []
    bottom_sequences = []
    if volume_info:
        ranked = sorted(range(len(volume_info)),
                        key=lambda i: volume_info[i]['volume'], reverse=True)
        top_count = max(1, len(ranked) // 5)  # always keep at least one
        top_sequences = [sequences[i] for i in ranked[:top_count]]
        bottom_sequences = [sequences[i] for i in ranked[top_count:]]

        # Only the first line of each group carries a label so that the
        # legend shows one entry per group.
        for i, sequence in enumerate(top_sequences):
            ax.plot(range(len(sequence)), sequence, color='red', alpha=0.7,
                    linewidth=1.2, label='最大20%成交量' if i == 0 else "")
        for i, sequence in enumerate(bottom_sequences):
            ax.plot(range(len(sequence)), sequence, color='blue', alpha=0.4,
                    linewidth=0.6, label='其他80%成交量' if i == 0 else "")

    ax.axhline(y=0, color='red', linestyle='--', alpha=0.7, linewidth=1.5)
    ax.set_xlabel('数据点序号')
    ax.set_ylabel('相对价格变化')
    ax.set_title(f'按成交量大小分组的价格走势 (后{length}点)\n红色:最大20%({len(top_sequences)}个) 蓝色:其他80%({len(bottom_sequences)}个)')
    ax.grid(True, alpha=0.3)
    if volume_info:
        ax.legend(fontsize=10)


def _draw_average_band(ax, avg_changes, std_changes, sequence_count, length):
    """Plot the mean relative change with a +/- one standard deviation band."""
    if avg_changes:
        x_axis = range(len(avg_changes))
        mean = np.asarray(avg_changes)
        spread = np.asarray(std_changes)
        ax.plot(x_axis, mean, color='green', linewidth=2.5,
                label=f'平均变化 (n={sequence_count})')
        ax.fill_between(x_axis, mean - spread, mean + spread, alpha=0.3,
                        color='green', label='±1标准差区间')

    ax.axhline(y=0, color='black', linestyle='--', alpha=0.7, linewidth=1.5)
    ax.set_xlabel('数据点序号')
    ax.set_ylabel('平均相对价格变化')
    ax.set_title(f'平均价格变化及置信区间 (后{length}点)')
    if avg_changes:
        # Without labelled artists a legend would only trigger a warning.
        ax.legend(fontsize=12)
    ax.grid(True, alpha=0.3)


def _save_comprehensive_figure(length, volume_threshold, summary, volume_sequences,
                               volume_info, avg_changes, std_changes, stats_text):
    """Render the 2x2 overview figure, save it as PNG and return its file name."""
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))
    fig.suptitle(
        f"当前成交量>{volume_threshold}的价格走势分析(优化版)(后{length}个数据点)\n原始信号:{summary['raw']}个 → 过滤后:{summary['kept']}个 (抑制{summary['suppressed']}个)",
        fontsize=14, fontweight='bold')

    # Panel 1: every filtered sequence.
    ax1 = axes[0, 0]
    _draw_gradient_sequences(ax1, volume_sequences, linewidth=0.8)
    ax1.set_xlabel('数据点序号')
    ax1.set_ylabel('相对价格变化')
    ax1.set_title(f'过滤后价格变化走势 (后{length}点)\n共{len(volume_sequences)}个有效序列')
    ax1.grid(True, alpha=0.3)
    suppression_text = f"原始: {summary['raw']}个\n过滤: {summary['kept']}个\n抑制: {summary['suppressed']}个"
    ax1.text(0.02, 0.98, suppression_text, transform=ax1.transAxes,
             verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.8))

    # Panel 2: sequences grouped by signal volume.
    _draw_volume_groups(axes[0, 1], volume_sequences, volume_info, length)

    # Panel 3: mean path with its standard-deviation band.
    _draw_average_band(axes[1, 0], avg_changes, std_changes,
                       len(volume_sequences), length)

    # Panel 4: statistics text box.
    ax4 = axes[1, 1]
    ax4.axis('off')
    ax4.text(0.05, 0.95, stats_text, transform=ax4.transAxes, fontsize=10,
             verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))

    fig.tight_layout()
    output_file = f'current_volume_optimized_comprehensive_analysis_{length}points.png'
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close(fig)  # release the figure right away instead of piling them up
    return output_file


def _save_relative_changes_figure(length, volume_threshold, summary, volume_sequences):
    """Render the stand-alone relative price chart and return its file name."""
    fig_volume, ax_volume = plt.subplots(figsize=(15, 10))
    _draw_gradient_sequences(ax_volume, volume_sequences, linewidth=1)
    ax_volume.set_xlabel(f'数据点序号 (相对于过滤后大成交量时刻, 后{length}点)', fontsize=12)
    ax_volume.set_ylabel('相对价格变化 (相对于基准点)', fontsize=12)
    ax_volume.set_title(
        f"当前成交量>{volume_threshold}过滤后数据点{length}个相对价格变化走势\n信号抑制: {summary['raw']}→{summary['kept']} (抑制{summary['suppressed']}个)\n共{len(volume_sequences)}个有效序列",
        fontsize=14, fontweight='bold')
    ax_volume.grid(True, alpha=0.3)

    fig_volume.tight_layout()
    volume_output_file = f'current_volume_optimized_relative_price_changes_{length}points.png'
    fig_volume.savefig(volume_output_file, dpi=300, bbox_inches='tight')
    plt.close(fig_volume)
    return volume_output_file


def _print_detailed_stats(length, summary, volume_stats, avg_changes, std_changes):
    """Print the per-horizon statistics report to stdout."""
    print("\n" + "=" * 50)
    print(f"详细统计信息 (后{length}点) - 优化版:")
    print("=" * 50)

    print("\n【信号抑制效果】")
    print(f"原始信号数量: {summary['raw']}")
    print(f"过滤后信号数量: {summary['kept']}")
    print(f"被抑制信号数量: {summary['suppressed']}")
    print(f"信号抑制率: {summary['rate']:.1f}%")

    if not volume_stats:
        return
    print("\n【过滤后价格统计】")
    print(f"序列数量: {volume_stats['count']}")
    print(f"平均最终变化: {volume_stats['avg_final_change']:.4f}")
    print(f"变化标准差: {volume_stats['std_final_change']:.4f}")
    print(f"最大上涨: {volume_stats['max_rise']:.4f}")
    print(f"最大下跌: {volume_stats['max_fall']:.4f}")
    print(f"上涨比例: {volume_stats['positive_ratio']:.1%}")
    print(f"平均最大获利: {volume_stats['avg_max_gain']:.4f}")
    print(f"平均最大亏损: {volume_stats['avg_max_loss']:.4f}")

    if avg_changes:
        print("\n关键时间点分析:")
        for name, avg, std in _reached_key_points(avg_changes, std_changes):
            print(f"{name}: {avg:.4f} (±{std:.4f})")


def analyze_current_volume_optimized(data_path='../data/au2512_20251013.parquet',
                                     volume_threshold=150,
                                     suppression_window=20,
                                     analysis_lengths=(100, 200, 500)):
    """Analyse the price path after large-volume ticks, with signal suppression.

    Rows whose per-tick volume exceeds *volume_threshold* are signals.
    Signals within *suppression_window* rows of a previously kept signal are
    discarded. For every horizon in *analysis_lengths* the price changes
    after each remaining signal are plotted, summarised and printed.

    Args:
        data_path: Parquet file to read.
        volume_threshold: A row is a signal when its volume is strictly
            greater than this value.
        suppression_window: Minimum row distance (exclusive) between two
            kept signals.
        analysis_lengths: Horizons, in rows after the signal, to analyse.

    Returns:
        None. Results are printed and two PNG files per horizon are written
        to the current working directory.
    """
    print("正在读取数据文件...")
    df = pd.read_parquet(data_path)
    print(f"数据总行数: {len(df)}")
    print(f"数据列名: {df.columns.tolist()}")

    current_volume_col = _find_volume_column(df)
    if current_volume_col is None:
        print("未找到合适的成交量列")
        return
    print(f"使用当前成交量列: {current_volume_col}")

    price_col = _find_price_column(df)
    if price_col is None:
        print("未找到成交价列")
        return
    print(f"使用成交价列: {price_col}")

    volume_series = df[current_volume_col]
    print("\n当前成交量统计信息:")
    print(f"最小值: {volume_series.min()}")
    print(f"最大值: {volume_series.max()}")
    print(f"平均值: {volume_series.mean():.2f}")
    print(f"中位数: {volume_series.median():.2f}")

    # Work with row positions rather than index labels so the look-ahead
    # arithmetic stays correct even when the frame has a non-default index.
    volumes = volume_series.to_numpy()
    prices = df[price_col].to_numpy()
    large_volume_positions = np.flatnonzero(
        (volume_series > volume_threshold).to_numpy()).tolist()

    print(f"\n找到当前成交量大于{volume_threshold}的数据点数量: {len(large_volume_positions)}")
    if not large_volume_positions:
        # Nothing to analyse; stop here instead of dividing by zero below.
        print("未找到满足条件的大成交量数据点,分析结束")
        return

    large_volumes = volume_series.iloc[large_volume_positions]
    print(f"大成交量统计: 最小={large_volumes.min():.0f}, 最大={large_volumes.max():.0f}, 平均={large_volumes.mean():.0f}")

    print(f"\n应用信号抑制逻辑({suppression_window}个数据点窗口)...")
    filtered_positions, suppressed_count = _apply_signal_suppression(
        large_volume_positions, suppression_window)
    summary = {
        'raw': len(large_volume_positions),
        'kept': len(filtered_positions),
        'suppressed': suppressed_count,
        'rate': suppressed_count / len(large_volume_positions) * 100,
    }
    print(f"原始信号数量: {summary['raw']}")
    print(f"抑制后信号数量: {summary['kept']}")
    print(f"被抑制的信号数量: {summary['suppressed']}")
    print(f"抑制率: {summary['rate']:.1f}%")

    # The earliest signal is always kept, so this selection is never empty.
    filtered_volumes = volume_series.iloc[filtered_positions]
    print(f"过滤后大成交量统计: 最小={filtered_volumes.min():.0f}, 最大={filtered_volumes.max():.0f}, 平均={filtered_volumes.mean():.0f}")

    for length in analysis_lengths:
        print("\n" + "=" * 60)
        print(f"分析当前成交量>{volume_threshold}后{length}个数据点的价格走势(优化版)")
        print("=" * 60)

        volume_sequences, volume_info = _extract_price_sequences(
            prices, volumes, filtered_positions, length)
        print(f"成功提取 {len(volume_sequences)} 个过滤后大成交量价格序列 (最大长度: {length})")

        avg_changes, std_changes = _calculate_avg_changes_and_std(volume_sequences)
        volume_stats = _calculate_stats(volume_sequences)
        stats_text = _build_stats_text(length, volume_threshold, summary, volume_stats,
                                       avg_changes, std_changes, volume_info)

        output_file = _save_comprehensive_figure(
            length, volume_threshold, summary, volume_sequences, volume_info,
            avg_changes, std_changes, stats_text)
        print(f"\n{length}点优化版综合分析图表已保存为: {output_file}")
        print(f"完整路径: {os.path.abspath(output_file)}")

        volume_output_file = _save_relative_changes_figure(
            length, volume_threshold, summary, volume_sequences)
        print(f"过滤后大成交量{length}点分析图表已保存为: {volume_output_file}")

        _print_detailed_stats(length, summary, volume_stats, avg_changes, std_changes)

    plt.close('all')  # safety net: make sure no figure outlives the run

    print("\n" + "=" * 60)
    print("优化版分析完成!")
    print("信号抑制逻辑已成功应用,减少了重复信号的影响。")
    print("=" * 60)


if __name__ == "__main__":
    analyze_current_volume_optimized()