# DS_L10N/lib/po_handler.py

"""
PO File Handler for DS_L10N
polib 기반의 안정적인 PO 파일 처리
"""
import polib
import csv
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from datetime import datetime
from dataclasses import dataclass


@dataclass
class POUpdateResult:
    """Outcome of applying a set of translations to one PO file."""
    total: int  # number of translations supplied for the file
    updated: int  # entries whose msgstr was changed (or would be, in a dry run)
    failed: int  # translations that could not be applied
    skipped: int  # entries whose msgstr already equalled the new text
    errors: List[Tuple[str, str]]  # (msgctxt, error_message)


class POHandler:
    """PO 파일 핸들러"""

    def __init__(self, config: dict, logger):
        self.config = config
        self.logger = logger
        self.po_filename = config.get('files', {}).get('po_filename', 'LocalExport.po')

    def load_po_file(self, po_path: Path) -> Optional[polib.POFile]:
        """PO 파일 로드"""
        try:
            if not po_path.exists():
                self.logger.error(f'PO 파일을 찾을 수 없습니다: {po_path}')
                return None

            po = polib.pofile(str(po_path), encoding='utf-8')
            return po

        except Exception as e:
            self.logger.error(f'PO 파일 로드 실패: {po_path} - {e}')
            return None

    def extract_untranslated(self, po_path: Path, output_path: Path) -> int:
        """
        미번역 항목 추출

        Returns:
            추출된 항목 개수
        """
        self.logger.info(f'PO 파일 로드 중: {po_path.name}')
        po = self.load_po_file(po_path)

        if po is None:
            return 0

        # 전체 항목 및 미번역 항목 필터링
        total_entries = len([entry for entry in po if entry.msgid])  # msgid가 있는 항목만 카운트
        untranslated = [entry for entry in po if not entry.msgstr.strip()]

        if not untranslated:
            translated_count = total_entries
            self.logger.info(f'전체 {total_entries}개 항목 중 {translated_count}개 번역 완료 (100%)')
            return 0

        self.logger.info(f'전체 {total_entries}개 항목 중 미번역 {len(untranslated)}건 발견')

        # TSV 파일로 저장
        self._save_to_tsv(untranslated, output_path)

        self.logger.success(f'미번역 항목 추출 완료: {output_path}')
        return len(untranslated)

    def merge_to_csv(self, localization_root: Path, output_path: Path) -> int:
        """
        여러 언어의 PO 파일을 하나의 CSV로 병합

        Returns:
            병합된 항목 개수
        """
        self.logger.info(f'언어 폴더 탐색 중: {localization_root}')

        # 언어 폴더 찾기
        lang_folders = []
        for item in localization_root.iterdir():
            if item.is_dir():
                po_file = item / self.po_filename
                if po_file.exists():
                    lang_folders.append(item.name)

        if not lang_folders:
            self.logger.error(f'{self.po_filename} 파일을 포함하는 언어 폴더를 찾을 수 없습니다.')
            return 0

        self.logger.info(f'탐지된 언어: {", ".join(lang_folders)}')

        # 각 언어별 PO 파일 파싱
        merged_data = {}

        for lang_code in lang_folders:
            po_file_path = localization_root / lang_code / self.po_filename
            self.logger.info(f'  - {lang_code} 처리 중...')

            po = self.load_po_file(po_file_path)
            if po is None:
                continue

            for entry in po:
                # msgctxt 추출 (언리얼 해시 키)
                msgctxt = entry.msgctxt if entry.msgctxt else 'NoContext'

                # SourceLocation 추출
                source_location = entry.occurrences[0][0] if entry.occurrences else 'NoSourceLocation'

                # 줄바꿈 문자를 문자열로 치환
                msgctxt_escaped = self._escape_newlines(msgctxt)
                msgid_escaped = self._escape_newlines(entry.msgid)
                msgstr_escaped = self._escape_newlines(entry.msgstr)
                source_location_escaped = self._escape_newlines(source_location)

                # 키 생성 (msgctxt + SourceLocation)
                key = (msgctxt_escaped, source_location_escaped)

                if key not in merged_data:
                    merged_data[key] = {}

                # 언어별 번역문 저장
                merged_data[key][lang_code] = msgstr_escaped

        # CSV 레코드 생성
        records = []
        for (msgctxt, source_location), data in merged_data.items():
            record = {
                'msgctxt': msgctxt,
                'SourceLocation': source_location,
            }

            # 언어별 번역 추가
            for key, value in data.items():
                record[key] = value

            records.append(record)

        if not records:
            self.logger.error('병합할 데이터가 없습니다.')
            return 0

        # 언어 컬럼 정렬
        all_langs = set()
        for record in records:
            all_langs.update(record.keys())
        all_langs -= {'msgctxt', 'SourceLocation'}

        # 선호 순서: ko를 맨 앞에, en을 두 번째로
        source_lang = self.config.get('languages', {}).get('source', 'ko')
        preferred_order = [source_lang] + self.config.get('languages', {}).get('targets', [])

        ordered_langs = [lang for lang in preferred_order if lang in all_langs]
        other_langs = sorted([lang for lang in all_langs if lang not in preferred_order])
        final_langs = ordered_langs + other_langs

        # CSV 저장
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', newline='', encoding='utf-8-sig') as f:
            fieldnames = ['msgctxt', 'SourceLocation'] + final_langs
            writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
            writer.writeheader()
            writer.writerows(records)

        self.logger.success(f'CSV 병합 완료: {output_path}')
        self.logger.info(f'총 {len(records)}개 항목, {len(final_langs)}개 언어')

        return len(records)

    def update_from_tsv(self, tsv_path: Path, localization_root: Path,
                        backup: bool = True, dry_run: bool = False) -> Dict[str, POUpdateResult]:
        """
        TSV 파일로 PO 파일 업데이트 (polib 사용)

        Args:
            tsv_path: 번역 TSV 파일 경로
            localization_root: 언어 폴더들의 루트
            backup: 백업 생성 여부
            dry_run: 실제 파일 수정 없이 시뮬레이션

        Returns:
            언어별 업데이트 결과
        """
        self.logger.info(f'TSV 파일 로드 중: {tsv_path}')

        # TSV 파일 읽기
        translations_by_lang = self._load_tsv(tsv_path)

        if not translations_by_lang:
            self.logger.error('TSV 파일에서 번역 데이터를 읽을 수 없습니다.')
            return {}

        self.logger.info(f'업데이트 대상 언어: {", ".join(translations_by_lang.keys())}')

        results = {}

        # 언어별로 PO 파일 업데이트
        for lang_code, translations in translations_by_lang.items():
            self.logger.info(f'\n언어 처리 중: {lang_code}')

            lang_folder = localization_root / lang_code
            if not lang_folder.is_dir():
                self.logger.warning(f'  언어 폴더를 찾을 수 없습니다: {lang_folder}')
                continue

            po_path = lang_folder / self.po_filename
            if not po_path.exists():
                self.logger.warning(f'  PO 파일을 찾을 수 없습니다: {po_path}')
                continue

            # PO 파일 업데이트
            result = self._update_po_file(po_path, translations, backup, dry_run)
            results[lang_code] = result

            # 결과 출력
            self._print_update_result(lang_code, result)

        return results

    def _update_po_file(self, po_path: Path, translations: Dict[str, str],
                        backup: bool, dry_run: bool) -> POUpdateResult:
        """단일 PO 파일 업데이트"""
        result = POUpdateResult(
            total=len(translations),
            updated=0,
            failed=0,
            skipped=0,
            errors=[]
        )

        # 백업 생성
        if backup and not dry_run:
            backup_path = self._create_backup(po_path)
            if backup_path:
                self.logger.info(f'  백업 생성: {backup_path.name}')

        # PO 파일 로드
        po = self.load_po_file(po_path)
        if po is None:
            result.failed = result.total
            result.errors.append(('ALL', 'PO 파일 로드 실패'))
            return result

        # msgctxt로 인덱싱
        po_index = {}
        for entry in po:
            if entry.msgctxt:
                po_index[entry.msgctxt] = entry

        # 번역문 업데이트
        for msgctxt, new_msgstr in translations.items():
            if msgctxt not in po_index:
                result.failed += 1
                result.errors.append((msgctxt, 'PO 파일에서 msgctxt를 찾을 수 없음'))
                continue

            entry = po_index[msgctxt]
            current_msgstr = entry.msgstr

            # 변경사항 없으면 스킵
            if current_msgstr == new_msgstr:
                result.skipped += 1
                continue

            # msgstr 업데이트
            if not dry_run:
                entry.msgstr = new_msgstr

            result.updated += 1

        # 파일 저장
        if not dry_run and result.updated > 0:
            try:
                po.save(str(po_path))
            except Exception as e:
                self.logger.error(f'  PO 파일 저장 실패: {e}')
                result.errors.append(('SAVE', str(e)))

        return result

    def _load_tsv(self, tsv_path: Path) -> Dict[str, Dict[str, str]]:
        """TSV 파일 로드"""
        translations_by_lang = {}

        try:
            with open(tsv_path, 'r', encoding='utf-8-sig') as f:
                reader = csv.DictReader(f, delimiter='\t')

                # 컬럼 확인
                if not reader.fieldnames or len(reader.fieldnames) <= 1:
                    self.logger.error('TSV 파일이 탭(tab)으로 구분되지 않았습니다.')
                    return {}

                # 제외할 컬럼
                exclude_columns = {'msgctxt', 'SourceLocation', 'msgid'}
                lang_codes = [col for col in reader.fieldnames if col not in exclude_columns]

                # 언어별 딕셔너리 초기화
                for lang in lang_codes:
                    translations_by_lang[lang] = {}

                # 행 읽기
                for row in reader:
                    msgctxt = row.get('msgctxt')
                    if not msgctxt:
                        continue

                    for lang in lang_codes:
                        msgstr = row.get(lang, '')
                        if msgstr:  # 빈 문자열이 아니면 저장
                            # TSV의 이스케이프된 줄바꿈을 실제 escape sequence로 변환
                            msgstr = self._unescape_newlines(msgstr)
                            translations_by_lang[lang][msgctxt] = msgstr

        except Exception as e:
            self.logger.error(f'TSV 파일 읽기 실패: {e}')
            return {}

        return translations_by_lang

    def _save_to_tsv(self, entries: List[polib.POEntry], output_path: Path):
        """POEntry 리스트를 TSV로 저장"""
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_ALL)

            # 헤더
            writer.writerow(['msgctxt', 'SourceLocation', 'msgid'])

            # 데이터
            for entry in entries:
                msgctxt = self._escape_newlines(entry.msgctxt or '')
                source_location = entry.occurrences[0][0] if entry.occurrences else ''
                msgid = self._escape_newlines(entry.msgid)

                writer.writerow([msgctxt, source_location, msgid])

    def _escape_newlines(self, text: str) -> str:
        """줄바꿈 문자를 문자열로 치환"""
        return text.replace('\r', '\\r').replace('\n', '\\n')

    def _unescape_newlines(self, text: str) -> str:
        """문자열 줄바꿈을 실제 문자로 변환"""
        return text.replace('\\r', '\r').replace('\\n', '\n')

    def _create_backup(self, po_path: Path) -> Optional[Path]:
        """백업 파일 생성"""
        try:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            backup_path = po_path.with_suffix(f'.backup_{timestamp}.po')
            backup_path.write_bytes(po_path.read_bytes())
            return backup_path
        except Exception as e:
            self.logger.warning(f'백업 생성 실패: {e}')
            return None

    def _print_update_result(self, lang_code: str, result: POUpdateResult):
        """업데이트 결과 출력"""
        if result.updated > 0:
            self.logger.success(f'  ✅ {lang_code}: {result.updated}건 업데이트')

        if result.skipped > 0:
            self.logger.info(f'  ⏭️  {lang_code}: {result.skipped}건 스킵 (변경사항 없음)')

        if result.failed > 0:
            self.logger.error(f'  ❌ {lang_code}: {result.failed}건 실패')

            # 실패 이유 출력 (최대 5개)
            for msgctxt, error in result.errors[:5]:
                self.logger.error(f'     - {msgctxt}: {error}')

            if len(result.errors) > 5:
                self.logger.error(f'     ... 외 {len(result.errors) - 5}건 더 있음')