# huice/large_orders/analyze_current_volume_optimized.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os

# Configure fonts for Chinese text: list CJK-capable families first, and turn
# off the Unicode minus sign (presumably because those fonts may lack the glyph).
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

def _find_column(columns, name_matches):
    """Return the first column whose string name satisfies ``name_matches``, else None."""
    return next((col for col in columns if name_matches(str(col))), None)


def _is_current_volume_name(name):
    """True for a per-tick volume column name (cumulative-volume names are excluded)."""
    mentions_volume = (
        '当前成交量' in name or 'cur_volume' in name.lower() or '成交量' in name
    )
    return mentions_volume and '累积' not in name


def _is_fallback_volume_name(name):
    """True for a generic quantity column name that is not cumulative, bid or ask."""
    return '量' in name and not any(tag in name for tag in ('累积', '买', '卖'))


def _is_price_name(name):
    """True for a trade-price column name."""
    return '成交价' in name or 'price' in name.lower()


def _suppress_signals(positions, suppression_window=20):
    """Drop signals that fall within ``suppression_window`` rows of a kept signal.

    Always returns ``(kept_positions, suppressed_count)``, including for empty
    input, so callers can unpack the result unconditionally.
    """
    kept = []
    suppressed_count = 0
    for pos in sorted(positions):
        # Positions are visited in ascending order, so the most recently kept
        # signal is the nearest one; comparing against it alone is equivalent
        # to scanning every kept signal.
        if kept and pos - kept[-1] <= suppression_window:
            suppressed_count += 1
        else:
            kept.append(pos)
    return kept, suppressed_count


def _extract_price_sequences(prices, volumes, positions, max_points):
    """Collect price changes relative to each signal row for up to ``max_points`` rows.

    ``prices`` and ``volumes`` are positional arrays and ``positions`` are row
    positions into them. Signals with no following rows are skipped.
    Returns ``(sequences, sequence_info)`` as two parallel lists.
    """
    sequences = []
    sequence_info = []
    total_rows = len(prices)

    for pos in positions:
        take_points = min(max_points, total_rows - pos - 1)
        if take_points <= 0:
            continue

        base_price = prices[pos]
        sequences.append(prices[pos + 1: pos + 1 + take_points] - base_price)
        sequence_info.append({
            'start_index': pos,
            'volume': volumes[pos],
            'base_price': base_price,
            'sequence_length': take_points
        })

    return sequences, sequence_info


def _mean_and_std_by_offset(sequences):
    """Return per-offset mean and standard deviation across sequences of unequal length."""
    if not sequences:
        return [], []

    avg_changes = []
    std_changes = []
    for offset in range(max(len(seq) for seq in sequences)):
        # Only sequences long enough to reach this offset contribute to it.
        values = [seq[offset] for seq in sequences if len(seq) > offset]
        avg_changes.append(np.mean(values))
        std_changes.append(np.std(values))
    return avg_changes, std_changes


def _summarize_sequences(sequences):
    """Return summary statistics of the sequences, or an empty dict if there is no data."""
    non_empty = [seq for seq in sequences if len(seq) > 0]
    if not non_empty:
        return {}

    final_changes = [seq[-1] for seq in non_empty]
    return {
        'count': len(sequences),
        'avg_final_change': np.mean(final_changes),
        'std_final_change': np.std(final_changes),
        'max_rise': np.max(final_changes),
        'max_fall': np.min(final_changes),
        'positive_ratio': sum(1 for change in final_changes if change > 0) / len(final_changes),
        'avg_max_gain': np.mean([np.max(seq) for seq in non_empty]),
        'avg_max_loss': np.mean([np.min(seq) for seq in non_empty])
    }


def _key_point_summaries(avg_changes, std_changes, key_points=(10, 50, 200, 500)):
    """Return ``(label, mean, std)`` for each 1-based key point covered by the data."""
    return [
        (f'第{point}点', avg_changes[point - 1], std_changes[point - 1])
        for point in key_points
        if point - 1 < len(avg_changes)
    ]


def _split_by_volume(sequences, sequence_info):
    """Split sequences into the top 20% by signal volume (at least one) and the rest."""
    if not sequence_info:
        return [], []

    order = sorted(range(len(sequence_info)),
                   key=lambda i: sequence_info[i]['volume'], reverse=True)
    top_count = max(1, len(order) // 5)
    top = [sequences[i] for i in order[:top_count]]
    rest = [sequences[i] for i in order[top_count:]]
    return top, rest


def _plot_sequences(ax, sequences, linewidth):
    """Draw every sequence on ``ax`` using a viridis colour gradient."""
    if not sequences:
        return
    colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(sequences)))
    for color, sequence in zip(colors, sequences):
        ax.plot(range(len(sequence)), sequence, color=color, alpha=0.6, linewidth=linewidth)


def analyze_current_volume_optimized(
    data_path='../data/au2512_20251013.parquet',
    volume_threshold=150,
    suppression_window=20,
    analysis_lengths=(100, 200, 500),
):
    """Analyze trade-price paths after large-volume ticks, with signal suppression.

    Reads a parquet file, locates the per-tick volume and trade-price columns
    by name, selects rows whose volume exceeds ``volume_threshold``, drops
    signals that occur within ``suppression_window`` rows of an already kept
    signal, and for every horizon in ``analysis_lengths`` plots and prints the
    price changes relative to the signal row.

    Args:
        data_path: Path of the parquet file to read.
        volume_threshold: A row is a signal when its volume is strictly greater
            than this value.
        suppression_window: Minimum row distance (exclusive) between two kept
            signals.
        analysis_lengths: Horizons, in rows after the signal, to analyze.

    Returns:
        None. Progress and statistics are printed; two PNG files per horizon
        are written to the current working directory. The function returns
        early (after printing a message) when a required column is missing or
        when no row exceeds the threshold.
    """
    print("正在读取数据文件...")
    df = pd.read_parquet(data_path)

    print(f"数据总行数: {len(df)}")
    print(f"数据列名: {df.columns.tolist()}")

    # Locate the per-tick volume column, falling back to a looser name match.
    current_volume_col = _find_column(df.columns, _is_current_volume_name)
    if current_volume_col is None:
        print("未找到当前成交量列，尝试查找其他可能的成交量列...")
        current_volume_col = _find_column(df.columns, _is_fallback_volume_name)
        if current_volume_col is not None:
            print(f"使用可能的成交量列: {current_volume_col}")

    if current_volume_col is None:
        print("未找到合适的成交量列")
        return

    print(f"使用当前成交量列: {current_volume_col}")

    price_col = _find_column(df.columns, _is_price_name)
    if price_col is None:
        print("未找到成交价列")
        return

    print(f"使用成交价列: {price_col}")

    volume_series = df[current_volume_col]
    print("\n当前成交量统计信息:")
    print(f"最小值: {volume_series.min()}")
    print(f"最大值: {volume_series.max()}")
    print(f"平均值: {volume_series.mean():.2f}")
    print(f"中位数: {volume_series.median():.2f}")

    # Work with row positions rather than index labels: the look-ahead and the
    # suppression window are both defined in rows, and positions stay valid
    # whatever index the parquet file restores.
    large_volume_mask = (volume_series > volume_threshold).fillna(False).to_numpy(dtype=bool)
    large_volume_positions = np.flatnonzero(large_volume_mask).tolist()
    raw_count = len(large_volume_positions)

    print(f"\n找到当前成交量大于{volume_threshold}的数据点数量: {raw_count}")

    if raw_count == 0:
        # Nothing to analyze; stopping here also avoids dividing by zero in
        # the suppression-rate computation below.
        print(f"未找到当前成交量大于{volume_threshold}的数据点，分析结束")
        return

    large_volumes = volume_series.iloc[large_volume_positions]
    print(f"大成交量统计: 最小={large_volumes.min():.0f}, 最大={large_volumes.max():.0f}, 平均={large_volumes.mean():.0f}")

    print(f"\n应用信号抑制逻辑（{suppression_window}个数据点窗口）...")
    filtered_positions, suppressed_count = _suppress_signals(
        large_volume_positions, suppression_window)
    kept_count = len(filtered_positions)
    suppression_rate = suppressed_count / raw_count * 100

    print(f"原始信号数量: {raw_count}")
    print(f"抑制后信号数量: {kept_count}")
    print(f"被抑制的信号数量: {suppressed_count}")
    print(f"抑制率: {suppression_rate:.1f}%")

    # The first signal is always kept, so the filtered set is non-empty here.
    filtered_volumes = volume_series.iloc[filtered_positions]
    print(f"过滤后大成交量统计: 最小={filtered_volumes.min():.0f}, 最大={filtered_volumes.max():.0f}, 平均={filtered_volumes.mean():.0f}")

    price_values = df[price_col].values
    volume_values = volume_series.values
    wide_rule = '=' * 60
    narrow_rule = '=' * 50

    for length in analysis_lengths:
        print(f"\n{wide_rule}")
        print(f"分析当前成交量>{volume_threshold}后{length}个数据点的价格走势（优化版）")
        print(f"{wide_rule}")

        volume_sequences, volume_info = _extract_price_sequences(
            price_values, volume_values, filtered_positions, length)
        sequence_count = len(volume_sequences)

        print(f"成功提取 {sequence_count} 个过滤后大成交量价格序列 (最大长度: {length})")

        avg_changes, std_changes = _mean_and_std_by_offset(volume_sequences)
        volume_stats = _summarize_sequences(volume_sequences)
        key_points = _key_point_summaries(avg_changes, std_changes)

        # ---- Comprehensive 2x2 figure ----
        fig, axes = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle(
            f'当前成交量>{volume_threshold}的价格走势分析（优化版）(后{length}个数据点)\n'
            f'原始信号:{raw_count}个 → 过滤后:{kept_count}个 (抑制{suppressed_count}个)',
            fontsize=14, fontweight='bold')

        # 1. Every filtered sequence.
        ax1 = axes[0, 0]
        _plot_sequences(ax1, volume_sequences, linewidth=0.8)
        ax1.axhline(y=0, color='red', linestyle='--', alpha=0.7, linewidth=1.5)
        ax1.set_xlabel('数据点序号')
        ax1.set_ylabel('相对价格变化')
        ax1.set_title(f'过滤后价格变化走势 (后{length}点)\n共{sequence_count}个有效序列')
        ax1.grid(True, alpha=0.3)

        suppression_text = f'原始: {raw_count}个\n过滤: {kept_count}个\n抑制: {suppressed_count}个'
        ax1.text(0.02, 0.98, suppression_text, transform=ax1.transAxes,
                 verticalalignment='top', bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.8))

        # 2. Sequences grouped by signal volume (top 20% vs. the rest).
        ax2 = axes[0, 1]
        top_sequences, bottom_sequences = _split_by_volume(volume_sequences, volume_info)
        for i, sequence in enumerate(top_sequences):
            ax2.plot(range(len(sequence)), sequence, color='red', alpha=0.7, linewidth=1.2,
                     label='最大20%成交量' if i == 0 else "")
        for i, sequence in enumerate(bottom_sequences):
            ax2.plot(range(len(sequence)), sequence, color='blue', alpha=0.4, linewidth=0.6,
                     label='其他80%成交量' if i == 0 else "")

        ax2.axhline(y=0, color='red', linestyle='--', alpha=0.7, linewidth=1.5)
        ax2.set_xlabel('数据点序号')
        ax2.set_ylabel('相对价格变化')
        ax2.set_title(f'按成交量大小分组的价格走势 (后{length}点)\n红色:最大20%({len(top_sequences)}个) 蓝色:其他80%({len(bottom_sequences)}个)')
        ax2.grid(True, alpha=0.3)
        if volume_info:
            ax2.legend(fontsize=10)

        # 3. Mean change with a +/- one standard deviation band.
        ax3 = axes[1, 0]
        if avg_changes:
            x_axis = range(len(avg_changes))
            ax3.plot(x_axis, avg_changes, color='green', linewidth=2.5, label=f'平均变化 (n={sequence_count})')
            upper_bound = [avg + std for avg, std in zip(avg_changes, std_changes)]
            lower_bound = [avg - std for avg, std in zip(avg_changes, std_changes)]
            ax3.fill_between(x_axis, lower_bound, upper_bound, alpha=0.3, color='green', label='±1标准差区间')

        ax3.axhline(y=0, color='black', linestyle='--', alpha=0.7, linewidth=1.5)
        ax3.set_xlabel('数据点序号')
        ax3.set_ylabel('平均相对价格变化')
        ax3.set_title(f'平均价格变化及置信区间 (后{length}点)')
        if avg_changes:
            # Only request a legend when labelled artists exist.
            ax3.legend(fontsize=12)
        ax3.grid(True, alpha=0.3)

        # 4. Statistics text box.
        ax4 = axes[1, 1]
        ax4.axis('off')

        text_parts = [
            f"=== 当前成交量>{volume_threshold} 统计信息 (后{length}点) ===\n",
            "信号抑制效果:\n",
            f"  原始信号: {raw_count}个\n",
            f"  过滤信号: {kept_count}个\n",
            f"  抑制数量: {suppressed_count}个\n",
            f"  抑制率: {suppression_rate:.1f}%\n\n",
        ]

        if volume_stats:
            text_parts += [
                "价格统计:\n",
                f"  序列数量: {volume_stats['count']}\n",
                f"  平均最终变化: {volume_stats['avg_final_change']:.4f}\n",
                f"  变化标准差: {volume_stats['std_final_change']:.4f}\n",
                f"  最大上涨: {volume_stats['max_rise']:.4f}\n",
                f"  最大下跌: {volume_stats['max_fall']:.4f}\n",
                f"  上涨比例: {volume_stats['positive_ratio']:.1%}\n",
                f"  平均最大获利: {volume_stats['avg_max_gain']:.4f}\n",
                f"  平均最大亏损: {volume_stats['avg_max_loss']:.4f}\n\n",
            ]

        if avg_changes:
            text_parts.append("=== 关键时间点分析 ===\n")
            text_parts += [
                f"  {name}: {avg:.4f} (±{std:.4f})\n" for name, avg, std in key_points
            ]

        if volume_info:
            volumes = [info['volume'] for info in volume_info]
            text_parts += [
                "\n=== 成交量信息 ===\n",
                f"  成交量范围: {min(volumes):.0f} - {max(volumes):.0f}\n",
                f"  平均成交量: {np.mean(volumes):.0f}\n",
                f"  成交量中位数: {np.median(volumes):.0f}",
            ]

        stats_text = "".join(text_parts)
        ax4.text(0.05, 0.95, stats_text, transform=ax4.transAxes, fontsize=10,
                 verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))

        fig.tight_layout()

        output_file = f'current_volume_optimized_comprehensive_analysis_{length}points.png'
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
        # Close only the figure created here so figures owned by the caller survive.
        plt.close(fig)
        print(f"\n{length}点优化版综合分析图表已保存为: {output_file}")
        print(f"完整路径: {os.path.abspath(output_file)}")

        # ---- Stand-alone figure with every filtered sequence ----
        fig_volume, ax_volume = plt.subplots(figsize=(15, 10))
        _plot_sequences(ax_volume, volume_sequences, linewidth=1)
        ax_volume.axhline(y=0, color='red', linestyle='--', alpha=0.7, linewidth=1.5)
        ax_volume.set_xlabel(f'数据点序号 (相对于过滤后大成交量时刻, 后{length}点)', fontsize=12)
        ax_volume.set_ylabel('相对价格变化 (相对于基准点)', fontsize=12)
        ax_volume.set_title(
            f'当前成交量>{volume_threshold}过滤后数据点{length}个相对价格变化走势\n'
            f'信号抑制: {raw_count}→{kept_count} (抑制{suppressed_count}个)\n'
            f'共{sequence_count}个有效序列',
            fontsize=14, fontweight='bold')
        ax_volume.grid(True, alpha=0.3)
        fig_volume.tight_layout()

        volume_output_file = f'current_volume_optimized_relative_price_changes_{length}points.png'
        fig_volume.savefig(volume_output_file, dpi=300, bbox_inches='tight')
        plt.close(fig_volume)
        print(f"过滤后大成交量{length}点分析图表已保存为: {volume_output_file}")

        # ---- Console report ----
        print(f"\n{narrow_rule}")
        print(f"详细统计信息 (后{length}点) - 优化版:")
        print(f"{narrow_rule}")

        print("\n【信号抑制效果】")
        print(f"原始信号数量: {raw_count}")
        print(f"过滤后信号数量: {kept_count}")
        print(f"被抑制信号数量: {suppressed_count}")
        print(f"信号抑制率: {suppression_rate:.1f}%")

        if volume_stats:
            print("\n【过滤后价格统计】")
            print(f"序列数量: {volume_stats['count']}")
            print(f"平均最终变化: {volume_stats['avg_final_change']:.4f}")
            print(f"变化标准差: {volume_stats['std_final_change']:.4f}")
            print(f"最大上涨: {volume_stats['max_rise']:.4f}")
            print(f"最大下跌: {volume_stats['max_fall']:.4f}")
            print(f"上涨比例: {volume_stats['positive_ratio']:.1%}")
            print(f"平均最大获利: {volume_stats['avg_max_gain']:.4f}")
            print(f"平均最大亏损: {volume_stats['avg_max_loss']:.4f}")

        if avg_changes:
            print("\n关键时间点分析:")
            for name, avg, std in key_points:
                print(f"{name}: {avg:.4f} (±{std:.4f})")

    print(f"\n{wide_rule}")
    print("优化版分析完成！")
    print("信号抑制逻辑已成功应用，减少了重复信号的影响。")
    print(f"{wide_rule}")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_current_volume_optimized()