From 2765844650c3f2360890751c537a04fb2c4c38d4 Mon Sep 17 00:00:00 2001
From: Langley <langley0@naver.com>
Date: Tue, 19 Aug 2025 12:31:04 +0900
Subject: [PATCH] =?UTF-8?q?=EB=B2=84=EA=B7=B8=20=EC=88=98=EC=A0=95=20?=
 =?UTF-8?q?=EB=B0=8F=20=ED=95=B4=EC=BB=A4=20=EA=B7=B8=EB=A3=B9=20=EB=B6=84?=
 =?UTF-8?q?=EB=A6=AC=20=EC=BD=94=EB=93=9C=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore       |   2 +-
 README.md        |  12 ++++
 crawler.py       |  28 ++++++++-
 requirements.txt |   4 ++
 score.py         | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 README.md
 create mode 100644 requirements.txt
 create mode 100644 score.py

diff --git a/.gitignore b/.gitignore
index ec7f06b..744ad57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
 /__pycache__/
-*.txt
+checkpoint.txt
 *.csv
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fb2d096
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+1. 데이터를 수집
+다음 명령을 실행
+`python crawler.py`
+
+주의점: 성능 문제로 한 번에 모든 데이터를 수집하지 않는다. 더 이상 데이터가 수집되지 않을 때까지 반복 실행하면 여러 개의 csv가 생성된다.
+
+2. 해커 분류
+다음 명령을 실행
+`python score.py 파일|폴더`
+
+수집된 csv 파일을 인자로 주면 해당 파일을 분석하여 결과를 hack_group.csv 에 저장한다
+폴더를 입력하면 폴더안에 있는 모든 csv 를 고려하여 결과를 분석한다
\ No newline at end of file
diff --git a/crawler.py b/crawler.py
index 3d5a395..5349704 100644
--- a/crawler.py
+++ b/crawler.py
@@ -444,6 +444,29 @@ def find_skill_uses_for_validation(
     return skill_use_set
 
 
+def collapse_duplicated(data):
+    """Collapse logs sharing uid/@timestamp/xyz/bef_xyz into one shallow copy whose "duplicated" is the group size."""
+    def make_key(item):
+        body = item.get("body") or {}  # tolerate logs without a body instead of raising
+        return (
+            item.get("uid"),  # the caller already accepts uid-less logs, so do not raise KeyError here
+            item.get("@timestamp"),
+            tuple(body.get("xyz") or ()), tuple(body.get("bef_xyz") or ()),
+        )
+
+    grouped = defaultdict(list)
+    for d in data:
+        grouped[make_key(d)].append(d)
+
+    collapsed = []
+    for items in grouped.values():
+        base = items[0].copy()  # shallow copy so the input log is not mutated
+        base["duplicated"] = len(items)
+        collapsed.append(base)
+
+    return collapsed
+
+
 # =========================
 # 2) 저장 전용 (분석 없음)
 # =========================
@@ -469,10 +492,11 @@ def save_verified_batches_to_csv(
         for verified_hack_logs in verified_batches_iter:
             summary_data = defaultdict(lambda: defaultdict(int))        
 
-            for log in verified_hack_logs:
+            collapsed = collapse_duplicated(verified_hack_logs)
+            for log in collapsed:
                 uid = log.get('uid')
                 if uid:
-                    hack_type = len(log)
+                    hack_type = log.get('duplicated')
                     summary_data[uid][hack_type] += 1
 
             uids_to_lookup = {log['uid'] for log in verified_hack_logs if 'uid' in log}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3a2f623
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+opensearch-py>=2.6.0
+requests>=2.31.0
+urllib3>=1.26.0
+# score.py imports numpy, pandas and scikit-learn directly
+numpy
+pandas
+scikit-learn
\ No newline at end of file
diff --git a/score.py b/score.py
new file mode 100644
index 0000000..fd33e38
--- /dev/null
+++ b/score.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+import os
+import glob
+import argparse
+from collections import Counter
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.mixture import GaussianMixture
+from sklearn.cluster import KMeans
+from sklearn.svm import OneClassSVM
+
+
+# -----------------------------
+# 기존 로직 (수정/보완 없이 그대로 사용)
+# -----------------------------
+def most_frequent(series):
+    """Return the most common non-null value in `series`, or NaN when there is no non-null value."""
+    return next((value for value, _ in Counter(x for x in series if pd.notna(x)).most_common(1)), np.nan)
+
+def build_uid_features(df, uid_col="uid", type_col="type", count_col="count", nick_col="nickname"):
+    """Aggregate `df` into one row per uid with nickname, total_count, unique_types and rows columns."""
+    agg = (
+        df.groupby(uid_col, as_index=False)
+          .agg(
+              nickname=(nick_col, most_frequent) if nick_col in df.columns else (uid_col, "size"),  # NOTE(review): falls back to the per-uid row count when `nick_col` is absent — confirm intended
+              total_count=(count_col, "sum"),  # sum of the count column per uid
+              unique_types=(type_col, "nunique"),  # number of distinct type values per uid
+              rows=(uid_col, "size")  # number of input rows per uid
+          )
+    )
+    return agg
+
+def score_with_type_weight(agg, w_count=1.0, w_type=1.5):
+    """Return a copy of `agg` with "score" = w_count * z(total_count) + w_type * z(unique_types) * unique_types."""
+    z_scores = StandardScaler().fit_transform(agg[["total_count", "unique_types"]].values)
+    z_count, z_type = z_scores[:, 0], z_scores[:, 1]
+    n_types = agg["unique_types"].to_numpy()
+    # NOTE: the type term is its z-score multiplied again by the raw unique_types count.
+    scored = agg.copy()
+    scored["score"] = w_count * z_count + w_type * z_type * n_types
+    return scored
+
+def split_groups(agg, method="gmm", quantile=0.5, random_state=42):
+    """Return a copy of `agg` with a "group" column ("HIGH"/"LOW") derived from the "score" column.
+    method (any other value raises ValueError):
+      - 'gmm' : 2-component GaussianMixture; the component with the larger mean is HIGH
+      - 'kmeans' : KMeans(k=2); the cluster with the larger center is HIGH
+      - 'quantile' : scores >= the `quantile` quantile are HIGH (default: median)
+      - 'oneclass' : One-Class SVM; inliers (predict == 1) are HIGH, the rest LOW
+    """
+    agg = agg.copy()
+    s = agg["score"].to_numpy().reshape(-1, 1)  # scores as an (n, 1) column for the estimators
+
+    if method == "gmm":
+        gmm = GaussianMixture(n_components=2, random_state=random_state)
+        labels = gmm.fit_predict(s)
+        means = gmm.means_.flatten()
+        high_component = int(np.argmax(means))
+        agg["group"] = np.where(labels == high_component, "HIGH", "LOW")
+
+    elif method == "kmeans":
+        km = KMeans(n_clusters=2, n_init="auto", random_state=random_state)
+        labels = km.fit_predict(s)
+        centers = km.cluster_centers_.flatten()
+        high_component = int(np.argmax(centers))
+        agg["group"] = np.where(labels == high_component, "HIGH", "LOW")
+
+    elif method == "quantile":
+        thr = np.quantile(agg["score"], quantile)
+        agg["group"] = np.where(agg["score"] >= thr, "HIGH", "LOW")
+
+    elif method == "oneclass":
+        oc = OneClassSVM(kernel="rbf", gamma="scale", nu=0.1)  # adjust nu if needed
+        oc.fit(s)
+        pred = oc.predict(s)  # 1: inliers, -1: outliers
+        agg["group"] = np.where(pred == 1, "HIGH", "LOW")
+
+    else:
+        raise ValueError("method must be one of: gmm, kmeans, quantile, oneclass")
+
+    return agg
+
+
+# -----------------------------
+# 추가: 입력 경로(파일/폴더) 처리 유틸
+# -----------------------------
+def load_input_path(path: str, pattern: str = "*.csv", recursive: bool = True) -> pd.DataFrame:
+    """Load one CSV file, or concat every file matching `pattern` under a directory (sorted by path).
+    Raises FileNotFoundError if `path` is missing or a directory has no matching file;
+    raises RuntimeError, chained to the underlying error, if a file in a directory cannot be read.
+    """
+    if os.path.isdir(path):
+        glob_pattern = os.path.join(path, "**", pattern) if recursive else os.path.join(path, pattern)
+        files = sorted(glob.glob(glob_pattern, recursive=recursive))
+        if not files:
+            raise FileNotFoundError(f"No CSV files found under directory: {path}")
+        dfs = []
+        for f in files:
+            try:
+                dfs.append(pd.read_csv(f))
+            except Exception as e:
+                raise RuntimeError(f"Failed to read '{f}': {e}") from e  # keep the original error as __cause__
+        df = pd.concat(dfs, ignore_index=True)
+    else:
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Path not found: {path}")
+        df = pd.read_csv(path)
+    return df
+
+
+# -----------------------------
+# 실행 엔트리포인트
+# -----------------------------
+def main():  # CLI entry point: load CSV input, score each uid, split into groups, write the result CSV
+    parser = argparse.ArgumentParser(description="UID 그룹 스코어링 및 분리 (파일 또는 폴더 입력)")
+    parser.add_argument("path", help="입력 CSV 파일 경로 또는 폴더 경로")
+    parser.add_argument("-o", "--out", default="hack_group.csv", help="출력 CSV 파일명 (기본: hack_group.csv)")
+    parser.add_argument("--pattern", default="*.csv", help="폴더 입력 시 읽을 파일 패턴 (기본: *.csv)")
+    parser.add_argument("--no-recursive", action="store_true", help="폴더 입력 시 하위 디렉토리 재귀 탐색 비활성화")
+    parser.add_argument("--uid-col", default="uid", help="UID 컬럼명 (기본: uid)")
+    parser.add_argument("--type-col", default="type", help="TYPE 컬럼명 (기본: type)")
+    parser.add_argument("--count-col", default="count", help="COUNT 컬럼명 (기본: count)")
+    parser.add_argument("--nick-col", default="nickname", help="닉네임 컬럼명 (기본: nickname)")
+    parser.add_argument("--w-count", type=float, default=1.0, help="스코어 가중치: count (기본: 1.0)")
+    parser.add_argument("--w-type", type=float, default=1.5, help="스코어 가중치: unique_types (기본: 1.5)")
+    parser.add_argument("--method", choices=["gmm", "kmeans", "quantile", "oneclass"], default="gmm",
+                        help="그룹 분리 방법 (기본: gmm)")
+    parser.add_argument("--quantile", type=float, default=0.5, help="method=quantile 사용 시 분위수(0~1)")
+    parser.add_argument("--random-state", type=int, default=42, help="랜덤 시드 (기본: 42)")
+    args = parser.parse_args()
+
+    # 1) Load the input (single file or directory)
+    df = load_input_path(args.path, pattern=args.pattern, recursive=(not args.no_recursive))
+    if df.empty:
+        raise ValueError("Loaded dataframe is empty. Check input path or files.")
+
+    # 2) Aggregate -> score -> split into groups
+    agg = build_uid_features(df, uid_col=args.uid_col, type_col=args.type_col,
+                             count_col=args.count_col, nick_col=args.nick_col)
+    agg = score_with_type_weight(agg, w_count=args.w_count, w_type=args.w_type)
+    if args.method == "quantile":  # --quantile is only forwarded for the quantile method
+        result = split_groups(agg, method=args.method, quantile=args.quantile, random_state=args.random_state)
+    else:
+        result = split_groups(agg, method=args.method, random_state=args.random_state)
+
+    # 3) Sort (group ascending, score descending) and save
+    result = result.sort_values(["group", "score"], ascending=[True, False])
+    result.to_csv(args.out, index=False)
+    print(f"[OK] input rows={len(df):,}, uids={result.shape[0]:,} → saved: {args.out}")
+
+
+if __name__ == "__main__":
+    main()