From 2765844650c3f2360890751c537a04fb2c4c38d4 Mon Sep 17 00:00:00 2001 From: Langley Date: Tue, 19 Aug 2025 12:31:04 +0900 Subject: [PATCH] =?UTF-8?q?=EB=B2=84=EA=B7=B8=20=EC=88=98=EC=A0=95=20?= =?UTF-8?q?=EB=B0=8F=20=ED=95=B4=EC=BB=A4=20=EA=B7=B8=EB=A3=B9=20=EB=B6=84?= =?UTF-8?q?=EB=A6=AC=20=EC=BD=94=EB=93=9C=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- README.md | 12 ++++ crawler.py | 28 ++++++++- requirements.txt | 4 ++ score.py | 155 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 198 insertions(+), 3 deletions(-) create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 score.py diff --git a/.gitignore b/.gitignore index ec7f06b..744ad57 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ /__pycache__/ -*.txt +checkpoint.txt *.csv \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fb2d096 --- /dev/null +++ b/README.md @@ -0,0 +1,12 @@ +1. 데이터를 수집 +다음 명령을 실행 +`python crawler.py` + +주의점 : 성능 문제로 한번에 모든 데이터를 수집하지 않는다. 더이상 데이터가 수집되지 않을때까지 반복실행하면 여러개의 csv 가 생성된다. + +2. 
def collapse_duplicated(data):
    """Collapse repeated log entries into one entry carrying a duplicate count.

    Two logs are treated as duplicates when they share the same ``uid``,
    ``@timestamp``, ``body.xyz`` and ``body.bef_xyz``.

    Args:
        data: Iterable of log dicts.

    Returns:
        A list with one shallow copy of the first log of every duplicate
        group, in first-seen order, each with an added ``"duplicated"`` key
        holding the size of its group. The input dicts are not modified.

    Logs that lack one of the key fields no longer raise ``KeyError``; the
    missing part is treated as ``None`` / an empty tuple, so the caller's own
    ``uid`` checks decide what to do with such entries.
    """

    def make_key(item):
        # Grouping key: uid + timestamp + xyz + bef_xyz.
        body = item.get("body") or {}
        return (
            item.get("uid"),
            item.get("@timestamp"),
            # Coordinates are converted to tuples so the key is hashable.
            tuple(body.get("xyz") or ()),
            tuple(body.get("bef_xyz") or ()),
        )

    grouped = defaultdict(list)
    for entry in data:
        grouped[make_key(entry)].append(entry)

    collapsed = []
    for items in grouped.values():
        base = items[0].copy()  # shallow copy: do not mutate the caller's log
        base["duplicated"] = len(items)
        collapsed.append(base)

    return collapsed
# -----------------------------
# Existing logic (scoring formula and split methods kept as they were)
# -----------------------------
def most_frequent(series):
    """Return the most common non-null value of *series*, or NaN if none.

    Ties are resolved in favour of the value seen first.
    """
    vals = [v for v in series if pd.notna(v)]
    return Counter(vals).most_common(1)[0][0] if vals else np.nan


def build_uid_features(df, uid_col="uid", type_col="type", count_col="count", nick_col="nickname"):
    """Build one row of aggregate features per uid.

    Args:
        df: Input rows.
        uid_col: Column identifying the user.
        type_col: Column whose distinct values are counted per uid.
        count_col: Column summed per uid.
        nick_col: Optional nickname column.

    Returns:
        DataFrame with columns ``uid_col``, ``nickname``, ``total_count``,
        ``unique_types`` and ``rows`` (number of input rows for the uid).
        ``nickname`` is the most frequent non-null nickname of the uid, or
        NaN when *df* has no ``nick_col`` column.

    Raises:
        KeyError: If ``uid_col``, ``type_col`` or ``count_col`` is missing.
    """
    missing = [c for c in (uid_col, type_col, count_col) if c not in df.columns]
    if missing:
        raise KeyError(f"Column(s) {missing} do not exist")

    has_nick = nick_col in df.columns
    spec = {}
    if has_nick:
        spec["nickname"] = (nick_col, most_frequent)
    spec["total_count"] = (count_col, "sum")
    spec["unique_types"] = (type_col, "nunique")
    # "size" counts every row of the group, whichever column it is taken on.
    spec["rows"] = (count_col, "size")

    agg = df.groupby(uid_col, as_index=False).agg(**spec)

    if not has_nick:
        # Keep the output schema stable, but do not fill the nickname column
        # with row counts as a placeholder.
        agg.insert(1, "nickname", np.nan)
    return agg


def score_with_type_weight(agg, w_count=1.0, w_type=1.5):
    """Add a ``score`` column computed from z-normalized features.

    ``total_count`` and ``unique_types`` are standardized with
    ``StandardScaler`` and combined as::

        score = w_count * z(total_count) + w_type * z(unique_types) * unique_types

    Returns a copy of *agg*; the input frame is left untouched.
    """
    feats = agg[["total_count", "unique_types"]].values
    Z = StandardScaler().fit_transform(feats)
    # NOTE(review): the type term multiplies the z-score by the raw
    # unique_types value, so it is not a plain weighted sum -- confirm intended.
    score = w_count * Z[:, 0] + w_type * Z[:, 1] * agg["unique_types"].to_numpy()
    agg = agg.copy()
    agg["score"] = score
    return agg


def split_groups(agg, method="gmm", quantile=0.5, random_state=42):
    """Label every row ``"HIGH"`` or ``"LOW"`` based on its ``score``.

    Args:
        agg: DataFrame with a ``score`` column.
        method: One of

            - ``"gmm"``: 2-component ``GaussianMixture``; the component with
              the larger mean is HIGH.
            - ``"kmeans"``: ``KMeans`` with k=2; the cluster with the larger
              center is HIGH.
            - ``"quantile"``: rows with ``score >=`` the given quantile of the
              scores are HIGH (default: the median).
            - ``"oneclass"``: One-Class SVM; inliers are HIGH, outliers LOW.
        quantile: Quantile in [0, 1]; only used when ``method="quantile"``.
        random_state: Seed passed to the gmm / kmeans estimators.

    Returns:
        A copy of *agg* with an added ``group`` column.

    Raises:
        ValueError: If *method* is not one of the supported names.
    """
    if method not in ("gmm", "kmeans", "quantile", "oneclass"):
        raise ValueError("method must be one of: gmm, kmeans, quantile, oneclass")

    agg = agg.copy()
    s = agg["score"].to_numpy().reshape(-1, 1)  # estimators expect a 2-D array

    if method == "gmm":
        gmm = GaussianMixture(n_components=2, random_state=random_state)
        labels = gmm.fit_predict(s)
        is_high = labels == int(np.argmax(gmm.means_.flatten()))
    elif method == "kmeans":
        km = KMeans(n_clusters=2, n_init="auto", random_state=random_state)
        labels = km.fit_predict(s)
        is_high = labels == int(np.argmax(km.cluster_centers_.flatten()))
    elif method == "quantile":
        thr = np.quantile(agg["score"], quantile)
        is_high = agg["score"] >= thr
    else:  # "oneclass"
        oc = OneClassSVM(kernel="rbf", gamma="scale", nu=0.1)  # tune nu if needed
        oc.fit(s)
        is_high = oc.predict(s) == 1  # 1: inliers, -1: outliers

    agg["group"] = np.where(is_high, "HIGH", "LOW")
    return agg


# -----------------------------
# Input path (file / directory) handling
# -----------------------------
def load_input_path(path: str, pattern: str = "*.csv", recursive: bool = True) -> pd.DataFrame:
    """Load a single CSV file, or every matching CSV under a directory.

    Args:
        path: CSV file path or directory path.
        pattern: Glob pattern applied when *path* is a directory.
        recursive: Whether sub-directories are searched as well.

    Returns:
        The loaded frame; for a directory, all files (in sorted path order)
        concatenated with a fresh index.

    Raises:
        FileNotFoundError: If *path* does not exist or the directory holds no
            matching file.
        RuntimeError: If a file inside the directory cannot be read; the
            original error is attached as ``__cause__``.
    """
    if not os.path.isdir(path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path)

    if recursive:
        glob_pattern = os.path.join(path, "**", pattern)
    else:
        glob_pattern = os.path.join(path, pattern)
    files = sorted(glob.glob(glob_pattern, recursive=recursive))
    if not files:
        raise FileNotFoundError(f"No CSV files found under directory: {path}")

    dfs = []
    for f in files:
        try:
            dfs.append(pd.read_csv(f))
        except Exception as e:
            # Name the offending file, and keep the original traceback chained.
            raise RuntimeError(f"Failed to read '{f}': {e}") from e
    return pd.concat(dfs, ignore_index=True)


# -----------------------------
# Entry point
# -----------------------------
def main():
    """Parse CLI arguments, score and group uids, and write the result CSV."""
    parser = argparse.ArgumentParser(description="UID 그룹 스코어링 및 분리 (파일 또는 폴더 입력)")
    parser.add_argument("path", help="입력 CSV 파일 경로 또는 폴더 경로")
    parser.add_argument("-o", "--out", default="hack_group.csv", help="출력 CSV 파일명 (기본: hack_group.csv)")
    parser.add_argument("--pattern", default="*.csv", help="폴더 입력 시 읽을 파일 패턴 (기본: *.csv)")
    parser.add_argument("--no-recursive", action="store_true", help="폴더 입력 시 하위 디렉토리 재귀 탐색 비활성화")
    parser.add_argument("--uid-col", default="uid", help="UID 컬럼명 (기본: uid)")
    parser.add_argument("--type-col", default="type", help="TYPE 컬럼명 (기본: type)")
    parser.add_argument("--count-col", default="count", help="COUNT 컬럼명 (기본: count)")
    parser.add_argument("--nick-col", default="nickname", help="닉네임 컬럼명 (기본: nickname)")
    parser.add_argument("--w-count", type=float, default=1.0, help="스코어 가중치: count (기본: 1.0)")
    parser.add_argument("--w-type", type=float, default=1.5, help="스코어 가중치: unique_types (기본: 1.5)")
    parser.add_argument("--method", choices=["gmm", "kmeans", "quantile", "oneclass"], default="gmm",
                        help="그룹 분리 방법 (기본: gmm)")
    parser.add_argument("--quantile", type=float, default=0.5, help="method=quantile 사용 시 분위수(0~1)")
    parser.add_argument("--random-state", type=int, default=42, help="랜덤 시드 (기본: 42)")
    args = parser.parse_args()

    # 1) Load the input (file or directory).
    df = load_input_path(args.path, pattern=args.pattern, recursive=(not args.no_recursive))
    if df.empty:
        raise ValueError("Loaded dataframe is empty. Check input path or files.")

    # 2) Aggregate -> score -> split into groups.
    agg = build_uid_features(df, uid_col=args.uid_col, type_col=args.type_col,
                             count_col=args.count_col, nick_col=args.nick_col)
    agg = score_with_type_weight(agg, w_count=args.w_count, w_type=args.w_type)
    # split_groups only reads `quantile` when method == "quantile", so it can
    # be passed unconditionally.
    result = split_groups(agg, method=args.method, quantile=args.quantile,
                          random_state=args.random_state)

    # 3) Sort (HIGH group first, then by descending score) and save.
    result = result.sort_values(["group", "score"], ascending=[True, False])
    result.to_csv(args.out, index=False)
    print(f"[OK] input rows={len(df):,}, uids={result.shape[0]:,} → saved: {args.out}")


if __name__ == "__main__":
    main()