버그 수정 및 해커 그룹 분리 코드 추가

2025-08-19 12:31:04 +09:00
parent 6654bfe4a2
commit 2765844650
5 changed files with 198 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,3 @@
 /__pycache__/
-*.txt
+checkpoint.txt
 *.csv
--- a/README.md
+++ b/README.md
@ -0,0 +1,12 @@
 1. 데이터를 수집
 다음 명령을 실행
 `python crawler.py`
 주의점: 성능 문제로 한 번에 모든 데이터를 수집하지 않는다. 더 이상 데이터가 수집되지 않을 때까지 반복 실행하면 여러 개의 csv가 생성된다.
 2. 해커 분류
 다음 명령을 실행
 `python score.py 파일|폴더`
 수집된 csv 파일을 인자로 주면 해당 파일을 분석하여 결과를 hack_group.csv에 저장한다.
 폴더를 입력하면 폴더 안에 있는 모든 csv를 함께 고려하여 결과를 분석한다.
--- a/crawler.py
+++ b/crawler.py
@ -444,6 +444,29 @@ def find_skill_uses_for_validation(
    return skill_use_set
 def collapse_duplicated(data):
    """Collapse repeated log records into one record with a repeat count.

    Two records are duplicates when they share the same ``uid``,
    ``@timestamp``, ``body["xyz"]`` and ``body["bef_xyz"]``.

    Missing fields are tolerated (a missing scalar becomes ``None``, a
    missing coordinate list becomes an empty tuple) so that one malformed
    record does not abort the whole batch with a ``KeyError``.

    Args:
        data: Iterable of log dicts.

    Returns:
        A list holding one shallow copy of the first record of every
        distinct key, in first-seen order, each with an added
        ``"duplicated"`` entry giving the number of records that shared
        that key. The input records themselves are not modified.
    """
    # Grouping key: uid + timestamp + xyz + bef_xyz
    def make_key(item):
        body = item.get("body") or {}
        return (
            item.get("uid"),
            item.get("@timestamp"),
            # Lists are unhashable, so convert to tuples for use as a dict key.
            tuple(body.get("xyz") or ()),
            tuple(body.get("bef_xyz") or ()),
        )

    grouped = defaultdict(list)
    for d in data:
        grouped[make_key(d)].append(d)

    collapsed = []
    for items in grouped.values():
        # Copy so the caller's original record is not mutated.
        base = items[0].copy()
        base["duplicated"] = len(items)
        collapsed.append(base)
    return collapsed
 # =========================
 # 2) 저장 전용 (분석 없음)
 # =========================
@ -469,10 +492,11 @@ def save_verified_batches_to_csv(
        for verified_hack_logs in verified_batches_iter:
            summary_data = defaultdict(lambda: defaultdict(int))        
-            for log in verified_hack_logs:
+            collapsed = collapse_duplicated(verified_hack_logs)
            for log in collapsed:
                uid = log.get('uid')
                if uid:
-                    hack_type = len(log)
+                    hack_type = log.get('duplicated')
                    summary_data[uid][hack_type] += 1
            uids_to_lookup = {log['uid'] for log in verified_hack_logs if 'uid' in log}
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
 opensearch-py>=2.6.0
 requests>=2.31.0
 urllib3>=1.26.0
 scikit-learn
 pandas
 numpy
--- a/score.py
+++ b/score.py
@ -0,0 +1,155 @@
 #!/usr/bin/env python3
 import os
 import glob
 import argparse
 from collections import Counter
 import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.mixture import GaussianMixture
 from sklearn.cluster import KMeans
 from sklearn.svm import OneClassSVM
 # -----------------------------
 # 기존 로직 (수정/보완 없이 그대로 사용)
 # -----------------------------
 def most_frequent(series):
    """Return the most common non-null value found in ``series``.

    Entries for which ``pd.notna`` is false are ignored. When nothing is
    left after filtering, ``np.nan`` is returned instead.
    """
    counts = Counter(value for value in series if pd.notna(value))
    if not counts:
        return np.nan
    ((winner, _),) = counts.most_common(1)
    return winner
 def build_uid_features(df, uid_col="uid", type_col="type", count_col="count", nick_col="nickname"):
    """Aggregate the input rows into one feature row per uid.

    Output columns (besides the uid column itself):
      - nickname: most frequent value of ``nick_col`` for the uid; when
        ``nick_col`` is not a column of ``df`` this falls back to the
        per-uid row count.
      - total_count: sum of ``count_col``.
      - unique_types: number of distinct values in ``type_col``.
      - rows: number of input rows for the uid.
    """
    if nick_col in df.columns:
        nickname_spec = (nick_col, most_frequent)
    else:
        nickname_spec = (uid_col, "size")

    # Named aggregations; dict order determines the output column order.
    named_aggs = {
        "nickname": nickname_spec,
        "total_count": (count_col, "sum"),
        "unique_types": (type_col, "nunique"),
        "rows": (uid_col, "size"),
    }
    per_uid = df.groupby(uid_col, as_index=False)
    return per_uid.agg(**named_aggs)
 def score_with_type_weight(agg, w_count=1.0, w_type=1.5):
    """Attach a weighted score column built from z-normalized features.

    ``total_count`` and ``unique_types`` are standardized with
    ``StandardScaler``. The score is::

        w_count * z(total_count) + w_type * z(unique_types) * unique_types

    i.e. the type term is additionally scaled by the raw ``unique_types``
    value. Returns a copy of ``agg`` with a new ``score`` column; the input
    frame is left untouched.
    """
    standardized = StandardScaler().fit_transform(
        agg[["total_count", "unique_types"]].values
    )
    z_count = standardized[:, 0]
    z_types = standardized[:, 1]

    scored = agg.copy()
    scored["score"] = w_count * z_count + w_type * z_types * agg["unique_types"].to_numpy()
    return scored
 def split_groups(agg, method="gmm", quantile=0.5, random_state=42):
    """Label every row of ``agg`` as "HIGH" or "LOW" based on its ``score``.

    method:
      - 'gmm'      : 2-component GaussianMixture; the component with the
                     larger mean is HIGH.
      - 'kmeans'   : KMeans with k=2; the cluster with the larger center
                     is HIGH.
      - 'quantile' : rows whose score is >= the given quantile of the
                     scores are HIGH (default: median).
      - 'oneclass' : One-Class SVM; inliers (the majority/normal group) are
                     HIGH, the remaining rows are LOW.

    Returns a copy of ``agg`` with an added ``group`` column.
    Raises ValueError for an unknown ``method``.
    """
    out = agg.copy()
    scores = out["score"].to_numpy().reshape(-1, 1)

    def label_by_top_center(assignments, centers):
        # The cluster/component whose center is largest is the HIGH group.
        top = int(np.argmax(centers.flatten()))
        return np.where(assignments == top, "HIGH", "LOW")

    if method == "quantile":
        cutoff = np.quantile(out["score"], quantile)
        out["group"] = np.where(out["score"] >= cutoff, "HIGH", "LOW")
    elif method == "gmm":
        mixture = GaussianMixture(n_components=2, random_state=random_state)
        assignments = mixture.fit_predict(scores)
        out["group"] = label_by_top_center(assignments, mixture.means_)
    elif method == "kmeans":
        clusterer = KMeans(n_clusters=2, n_init="auto", random_state=random_state)
        assignments = clusterer.fit_predict(scores)
        out["group"] = label_by_top_center(assignments, clusterer.cluster_centers_)
    elif method == "oneclass":
        detector = OneClassSVM(kernel="rbf", gamma="scale", nu=0.1)  # tune nu if needed
        detector.fit(scores)
        verdict = detector.predict(scores)  # 1: inliers, -1: outliers
        out["group"] = np.where(verdict == 1, "HIGH", "LOW")
    else:
        raise ValueError("method must be one of: gmm, kmeans, quantile, oneclass")
    return out
 # -----------------------------
 # 추가: 입력 경로(파일/폴더) 처리 유틸
 # -----------------------------
 def load_input_path(path: str, pattern: str = "*.csv", recursive: bool = True) -> pd.DataFrame:
    """Load one CSV file, or every matching CSV under a directory.

    Args:
        path: A CSV file path or a directory path.
        pattern: Glob pattern used to select files when ``path`` is a
            directory.
        recursive: When ``path`` is a directory, also search its
            sub-directories.

    Returns:
        The parsed file, or the row-wise concatenation (with a fresh
        index) of all matching files, read in sorted path order.

    Raises:
        FileNotFoundError: ``path`` does not exist, or the directory
            contains no matching files.
        RuntimeError: A file inside the directory could not be read; the
            original error is attached as ``__cause__``.
    """
    if not os.path.isdir(path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path)

    glob_pattern = os.path.join(path, "**", pattern) if recursive else os.path.join(path, pattern)
    # Keep regular files only: a directory whose name matches the pattern
    # cannot be parsed as CSV and would otherwise abort the whole load.
    files = sorted(
        f for f in glob.glob(glob_pattern, recursive=recursive) if os.path.isfile(f)
    )
    if not files:
        raise FileNotFoundError(f"No CSV files found under directory: {path}")

    dfs = []
    for f in files:
        try:
            dfs.append(pd.read_csv(f))
        except Exception as e:
            # Re-raise with the file name for context, chaining explicitly so
            # the underlying parser/IO error remains visible as the cause.
            raise RuntimeError(f"Failed to read '{f}': {e}") from e
    return pd.concat(dfs, ignore_index=True)
 # -----------------------------
 # 실행 엔트리포인트
 # -----------------------------
 def _build_arg_parser():
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser(description="UID 그룹 스코어링 및 분리 (파일 또는 폴더 입력)")
    # Input / output locations.
    parser.add_argument("path", help="입력 CSV 파일 경로 또는 폴더 경로")
    parser.add_argument("-o", "--out", default="hack_group.csv", help="출력 CSV 파일명 (기본: hack_group.csv)")
    parser.add_argument("--pattern", default="*.csv", help="폴더 입력 시 읽을 파일 패턴 (기본: *.csv)")
    parser.add_argument("--no-recursive", action="store_true", help="폴더 입력 시 하위 디렉토리 재귀 탐색 비활성화")
    # Column names of the input CSV.
    parser.add_argument("--uid-col", default="uid", help="UID 컬럼명 (기본: uid)")
    parser.add_argument("--type-col", default="type", help="TYPE 컬럼명 (기본: type)")
    parser.add_argument("--count-col", default="count", help="COUNT 컬럼명 (기본: count)")
    parser.add_argument("--nick-col", default="nickname", help="닉네임 컬럼명 (기본: nickname)")
    # Scoring weights.
    parser.add_argument("--w-count", type=float, default=1.0, help="스코어 가중치: count (기본: 1.0)")
    parser.add_argument("--w-type", type=float, default=1.5, help="스코어 가중치: unique_types (기본: 1.5)")
    # Group-splitting options.
    parser.add_argument("--method", choices=["gmm", "kmeans", "quantile", "oneclass"], default="gmm",
                        help="그룹 분리 방법 (기본: gmm)")
    parser.add_argument("--quantile", type=float, default=0.5, help="method=quantile 사용 시 분위수(0~1)")
    parser.add_argument("--random-state", type=int, default=42, help="랜덤 시드 (기본: 42)")
    return parser

 def main():
    """CLI entry point: load CSV input, score each uid, split into groups, save.

    Raises ValueError when the loaded data contains no rows.
    """
    args = _build_arg_parser().parse_args()

    # 1) Load the input (single file or directory).
    df = load_input_path(args.path, pattern=args.pattern, recursive=(not args.no_recursive))
    if df.empty:
        raise ValueError("Loaded dataframe is empty. Check input path or files.")

    # 2) Aggregate -> score -> split into groups.
    features = build_uid_features(
        df,
        uid_col=args.uid_col,
        type_col=args.type_col,
        count_col=args.count_col,
        nick_col=args.nick_col,
    )
    scored = score_with_type_weight(features, w_count=args.w_count, w_type=args.w_type)
    # The quantile option is only forwarded when the quantile method is selected.
    split_kwargs = {"method": args.method, "random_state": args.random_state}
    if args.method == "quantile":
        split_kwargs["quantile"] = args.quantile
    result = split_groups(scored, **split_kwargs)

    # 3) Sort (group ascending, score descending within a group) and save.
    result = result.sort_values(["group", "score"], ascending=[True, False])
    result.to_csv(args.out, index=False)
    print(f"[OK] input rows={len(df):,}, uids={result.shape[0]:,} → saved: {args.out}")
 # Run the CLI only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()