opensearch-tracker/score.py

#!/usr/bin/env python3
import os
import glob
import argparse
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.svm import OneClassSVM


# -----------------------------
# Core logic (originally carried over unchanged from the existing implementation)
# -----------------------------
def most_frequent(series):
    """Return the most common non-missing value of ``series``.

    Values for which ``pd.notna`` is false are ignored. When several values
    share the highest count, the one encountered first wins. If no usable
    value remains, ``np.nan`` is returned.
    """
    tally = Counter(item for item in series if pd.notna(item))
    if not tally:
        return np.nan
    [(winner, _occurrences)] = tally.most_common(1)
    return winner

def build_uid_features(df, uid_col="uid", type_col="type", count_col="count", nick_col="nickname"):
    """Build one row of aggregate features per uid.

    Args:
        df: Input rows; must contain ``uid_col``, ``type_col`` and ``count_col``.
        uid_col: Column to group by.
        type_col: Column whose distinct values are counted per uid.
        count_col: Column that is summed per uid.
        nick_col: Optional nickname column. When present, the most frequent
            non-missing nickname per uid is kept.

    Returns:
        A DataFrame with the columns ``uid_col``, ``nickname``,
        ``total_count``, ``unique_types`` and ``rows`` (in that order).
        ``nickname`` is NaN for every uid when ``nick_col`` is not in ``df``.
    """
    has_nick = nick_col in df.columns

    named_aggs = {}
    if has_nick:
        named_aggs["nickname"] = (nick_col, most_frequent)
    named_aggs["total_count"] = (count_col, "sum")
    named_aggs["unique_types"] = (type_col, "nunique")
    named_aggs["rows"] = (uid_col, "size")

    agg = df.groupby(uid_col, as_index=False).agg(**named_aggs)

    if not has_nick:
        # Keep the output schema stable, but mark the nickname as unknown
        # instead of filling it with the row count (which duplicated `rows`).
        agg.insert(1, "nickname", np.nan)
    return agg

def score_with_type_weight(agg, w_count=1.0, w_type=1.5):
    """Attach a ``score`` column computed from standardized features.

    ``total_count`` and ``unique_types`` are z-scored with ``StandardScaler``.
    The score is ``w_count * z(total_count)`` plus
    ``w_type * z(unique_types) * unique_types``.

    NOTE(review): the type term is multiplied by the raw ``unique_types``
    value in addition to its z-score, so this is not a plain weighted sum of
    z-scores -- confirm this is intended.

    Returns a copy of ``agg``; the input frame is not modified.
    """
    raw_features = agg[["total_count", "unique_types"]].values
    standardized = StandardScaler().fit_transform(raw_features)
    z_count = standardized[:, 0]
    z_types = standardized[:, 1]

    count_term = w_count * z_count
    type_term = w_type * z_types * agg["unique_types"].to_numpy()

    scored = agg.copy()
    scored["score"] = count_term + type_term
    return scored

def split_groups(agg, method="gmm", quantile=0.5, random_state=42, nu=0.1):
    """Label every row of ``agg`` as ``"HIGH"`` or ``"LOW"`` based on ``score``.

    Args:
        agg: DataFrame with a ``score`` column.
        method: How to split the scores:
            - ``'gmm'``: two-component GaussianMixture; the component with the
              larger mean is HIGH.
            - ``'kmeans'``: KMeans with k=2; the cluster with the larger
              center is HIGH.
            - ``'quantile'``: rows with ``score >= quantile-th quantile`` are
              HIGH (default: the median).
            - ``'oneclass'``: One-Class SVM; inliers are HIGH, outliers LOW.
        quantile: Quantile in [0, 1]; only used by ``method='quantile'``.
        random_state: Seed for the GMM and KMeans models.
        nu: ``nu`` of the One-Class SVM; only used by ``method='oneclass'``.

    Returns:
        A copy of ``agg`` with an added ``group`` column.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    result = agg.copy()
    # scikit-learn estimators expect a 2-D (n_samples, n_features) array.
    scores = result["score"].to_numpy().reshape(-1, 1)

    if method == "gmm":
        gmm = GaussianMixture(n_components=2, random_state=random_state)
        labels = gmm.fit_predict(scores)
        # Component ids are arbitrary; pick HIGH by the larger mean.
        high_label = int(np.argmax(gmm.means_.flatten()))
        is_high = labels == high_label

    elif method == "kmeans":
        km = KMeans(n_clusters=2, n_init="auto", random_state=random_state)
        labels = km.fit_predict(scores)
        # Cluster ids are arbitrary; pick HIGH by the larger center.
        high_label = int(np.argmax(km.cluster_centers_.flatten()))
        is_high = labels == high_label

    elif method == "quantile":
        threshold = np.quantile(result["score"], quantile)
        is_high = result["score"] >= threshold

    elif method == "oneclass":
        detector = OneClassSVM(kernel="rbf", gamma="scale", nu=nu)
        detector.fit(scores)
        # predict() returns 1 for inliers and -1 for outliers.
        is_high = detector.predict(scores) == 1

    else:
        raise ValueError("method must be one of: gmm, kmeans, quantile, oneclass")

    result["group"] = np.where(is_high, "HIGH", "LOW")
    return result


# -----------------------------
# Added: utility for handling the input path (file or folder)
# -----------------------------
def load_input_path(path: str, pattern: str = "*.csv", recursive: bool = True) -> pd.DataFrame:
    """Load CSV data from a single file or from every matching file in a folder.

    Args:
        path: A CSV file, or a directory to search.
        pattern: Glob pattern for file names when ``path`` is a directory.
        recursive: When true, sub-directories are searched as well.

    Returns:
        The file's contents, or the row-wise concatenation (with a fresh
        index) of all matching files in sorted path order.

    Raises:
        FileNotFoundError: If ``path`` does not exist, or a directory
            contains no matching files.
        RuntimeError: If one of the matched files cannot be read.
    """
    if not os.path.isdir(path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path)

    # Escape the directory part so glob metacharacters in its name
    # ("[", "]", "*", "?") are matched literally; only `pattern` is a glob.
    base = glob.escape(path)
    parts = (base, "**", pattern) if recursive else (base, pattern)
    matches = glob.glob(os.path.join(*parts), recursive=recursive)
    # A directory whose name matches the pattern is not a data file.
    files = sorted(f for f in matches if os.path.isfile(f))
    if not files:
        raise FileNotFoundError(f"No CSV files found under directory: {path}")

    frames = []
    for f in files:
        try:
            frames.append(pd.read_csv(f))
        except Exception as e:
            # Name the offending file while keeping the original cause chained.
            raise RuntimeError(f"Failed to read '{f}': {e}") from e
    return pd.concat(frames, ignore_index=True)


# -----------------------------
# Script entry point
# -----------------------------
def _build_arg_parser():
    """Create the command-line parser for the scoring script."""
    parser = argparse.ArgumentParser(description="UID 그룹 스코어링 및 분리 (파일 또는 폴더 입력)")
    parser.add_argument("path", help="입력 CSV 파일 경로 또는 폴더 경로")
    parser.add_argument("-o", "--out", default="hack_group.csv", help="출력 CSV 파일명 (기본: hack_group.csv)")
    parser.add_argument("--pattern", default="*.csv", help="폴더 입력 시 읽을 파일 패턴 (기본: *.csv)")
    parser.add_argument("--no-recursive", action="store_true", help="폴더 입력 시 하위 디렉토리 재귀 탐색 비활성화")
    parser.add_argument("--uid-col", default="uid", help="UID 컬럼명 (기본: uid)")
    parser.add_argument("--type-col", default="type", help="TYPE 컬럼명 (기본: type)")
    parser.add_argument("--count-col", default="count", help="COUNT 컬럼명 (기본: count)")
    parser.add_argument("--nick-col", default="nickname", help="닉네임 컬럼명 (기본: nickname)")
    parser.add_argument("--w-count", type=float, default=1.0, help="스코어 가중치: count (기본: 1.0)")
    parser.add_argument("--w-type", type=float, default=1.5, help="스코어 가중치: unique_types (기본: 1.5)")
    parser.add_argument("--method", choices=["gmm", "kmeans", "quantile", "oneclass"], default="gmm",
                        help="그룹 분리 방법 (기본: gmm)")
    parser.add_argument("--quantile", type=float, default=0.5, help="method=quantile 사용 시 분위수(0~1)")
    parser.add_argument("--random-state", type=int, default=42, help="랜덤 시드 (기본: 42)")
    return parser


def main(argv=None):
    """Run the pipeline: load CSV input, score each uid, split groups, save.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse reads them from ``sys.argv`` as before.

    Raises:
        ValueError: If the loaded input contains no rows.
    """
    args = _build_arg_parser().parse_args(argv)

    # 1) Load the input (single file or folder).
    df = load_input_path(args.path, pattern=args.pattern, recursive=(not args.no_recursive))
    if df.empty:
        raise ValueError("Loaded dataframe is empty. Check input path or files.")

    # 2) Aggregate -> score -> split into groups.
    agg = build_uid_features(df, uid_col=args.uid_col, type_col=args.type_col,
                             count_col=args.count_col, nick_col=args.nick_col)
    agg = score_with_type_weight(agg, w_count=args.w_count, w_type=args.w_type)
    # `quantile` is only read by the quantile method, so it can always be passed.
    result = split_groups(agg, method=args.method, quantile=args.quantile,
                          random_state=args.random_state)

    # 3) Sort (by group name, then score descending) and save.
    result = result.sort_values(["group", "score"], ascending=[True, False])
    result.to_csv(args.out, index=False)
    print(f"[OK] input rows={len(df):,}, uids={result.shape[0]:,} → saved: {args.out}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()