#!/usr/bin/env python3
"""Telegram bot: fetches tech news from RSS feeds, summarizes with Claude, sends to Telegram."""
import json, os, re, time, hashlib, sqlite3
from datetime import datetime, timedelta, timezone
import feedparser, requests
from openai import OpenAI

# Directory containing this script; config.json and sent.db are resolved relative to it.
DIR = os.path.dirname(os.path.abspath(__file__))
# Settings are loaded once at import time. The context manager closes the file
# handle right away, and JSON is decoded as UTF-8 regardless of the locale.
with open(os.path.join(DIR, 'config.json'), encoding='utf-8') as _cfg_file:
    CFG = json.load(_cfg_file)
# SQLite file that remembers which article links were already handled.
DB = os.path.join(DIR, 'sent.db')
# Base URL for Telegram Bot API calls (method name is appended by callers).
BOT = f"https://api.telegram.org/bot{CFG['telegram_bot_token']}"
# Fixed UTC+9 offset used when stamping the digest header.
KST = timezone(timedelta(hours=9))
# OpenAI-compatible client pointed at the OpenRouter endpoint.
LLM = OpenAI(base_url='https://openrouter.ai/api/v1', api_key=CFG['openrouter_api_key'])
LLM_MODEL = 'deepseek/deepseek-chat-v3-0324'


def init_db():
    """Open the sent-links database, ensuring the table exists and pruning old rows.

    Rows whose timestamp is more than seven days old are deleted on every call.

    Returns:
        An open ``sqlite3`` connection; the caller is responsible for closing it.
    """
    week_ago = int(time.time()) - 7*86400
    conn = sqlite3.connect(DB)
    statements = (
        ('CREATE TABLE IF NOT EXISTS sent (hash TEXT PRIMARY KEY, ts INTEGER)', ()),
        ('DELETE FROM sent WHERE ts < ?', (week_ago,)),
    )
    for sql, params in statements:
        conn.execute(sql, params)
    conn.commit()
    return conn


def was_sent(conn, url):
    """Return True when the MD5 digest of ``url`` is already stored in ``sent``."""
    digest = hashlib.md5(url.encode()).hexdigest()
    row = conn.execute('SELECT 1 FROM sent WHERE hash=?', (digest,)).fetchone()
    return row is not None


def mark_sent(conn, url):
    """Record the MD5 digest of ``url`` with the current time and commit.

    A digest that is already present is left untouched (``INSERT OR IGNORE``).
    """
    digest = hashlib.md5(url.encode()).hexdigest()
    stamp = int(time.time())
    conn.execute('INSERT OR IGNORE INTO sent VALUES (?,?)', (digest, stamp))
    conn.commit()


def get_chat_id():
    """Return the Telegram chat id to send to, discovering it if necessary.

    A truthy ``chat_id`` in the loaded config is returned as-is. Otherwise the
    bot's pending updates are scanned for the first message whose text starts
    with ``/start``; that chat's id is stored in ``CFG``, written back to
    config.json, and returned.

    Returns:
        The chat id, or None when no ``/start`` message was found.
    """
    saved = CFG.get('chat_id')
    if saved:
        return saved

    updates = requests.get(f"{BOT}/getUpdates", timeout=10).json()
    for update in updates.get('result', []):
        message = update.get('message', {})
        if not message.get('text', '').startswith('/start'):
            continue
        chat_id = message['chat']['id']
        CFG['chat_id'] = chat_id
        # Persist the discovered id so later runs skip the getUpdates call.
        config_path = os.path.join(DIR, 'config.json')
        with open(config_path, 'w') as f:
            json.dump(CFG, f, indent=2)
        return chat_id
    return None


def shorten_url(url):
    """Shorten ``url`` through buly.kr, falling back to the original URL.

    Up to five attempts are made, pausing three seconds between attempts (but
    not after the last one). This is best-effort: network and decoding errors
    are reported on stdout and never raised.

    Args:
        url: The URL to shorten. An empty value is returned unchanged without
            contacting the service.

    Returns:
        ``https://buly.kr/<key>`` on success, otherwise ``url`` itself.
    """
    if not url:
        return url

    attempts = 5
    for attempt in range(attempts):
        try:
            r = requests.post('https://buly.kr/lib/control.siso',
                data={'CODE': '', 'dbControl': 'setTargetLinkToShotLink', 'v': url},
                headers={'Referer': 'https://buly.kr/', 'User-Agent': 'Mozilla/5.0'},
                timeout=10)
            data = r.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print(f"  Shortener error (attempt {attempt + 1}/{attempts}): {e}")
        else:
            # Guard against a JSON reply that is not an object.
            if isinstance(data, dict) and data.get('result') == 'Y' and data.get('key'):
                return f"https://buly.kr/{data['key']}"
        if attempt < attempts - 1:
            time.sleep(3)
    return url


def send_msg(chat_id, text):
    """Post ``text`` to ``chat_id`` via the bot's sendMessage method.

    The message is sent with HTML parse mode and link previews disabled.

    Returns:
        The decoded JSON body of the Telegram response.
    """
    payload = {
        'chat_id': chat_id,
        'text': text,
        'parse_mode': 'HTML',
        'disable_web_page_preview': True
    }
    response = requests.post(f"{BOT}/sendMessage", json=payload, timeout=15)
    return response.json()


def fetch_news():
    """Collect recent entries from every configured feed, newest first.

    Entries older than ``schedule_hours`` (default 24) are skipped; entries
    with no parsed date are stamped with the current UTC time. At most
    ``max_articles_per_feed`` (default 15) entries are read per feed. An error
    while processing a feed is printed and the remaining feeds still run.

    Returns:
        A list of dicts with keys ``source``, ``icon``, ``title``, ``link``,
        ``desc`` (tags stripped, at most 500 chars) and ``date``.
    """
    window = timedelta(hours=CFG.get('schedule_hours', 24))
    oldest_allowed = datetime.now(timezone.utc) - window
    collected = []

    for source in CFG['feeds']:
        try:
            parsed = feedparser.parse(source['url'])
            per_feed_limit = CFG.get('max_articles_per_feed', 15)
            for item in parsed.entries[:per_feed_limit]:
                stamp = item.get('published_parsed') or item.get('updated_parsed')
                if not stamp:
                    when = datetime.now(timezone.utc)
                else:
                    when = datetime(*stamp[:6], tzinfo=timezone.utc)
                    if when < oldest_allowed:
                        continue

                # Drop CDATA wrappers first, then any remaining markup tags.
                summary = item.get('description', '')
                summary = re.sub(r'<!\[CDATA\[|]]>', '', summary)
                summary = re.sub(r'<[^>]+>', '', summary).strip()

                record = {
                    'source': source['name'],
                    'icon': source.get('icon', ''),
                    'title': item.get('title', ''),
                    'link': item.get('link', ''),
                    'desc': summary[:500],
                    'date': when,
                }
                collected.append(record)
        except Exception as e:
            print(f"  Error fetching {source['name']}: {e}")

    return sorted(collected, key=lambda art: art['date'], reverse=True)


# English proper nouns the model tends to leave untranslated, mapped to their
# Korean transliterations. Applied in code so no extra tokens are spent.
_KO_MAP = {
    # Well-known listed companies
    'Apple': '애플', 'Google': '구글', 'Alphabet': '알파벳', 'Amazon': '아마존',
    'Microsoft': '마이크로소프트', 'Nvidia': '엔비디아', 'NVIDIA': '엔비디아',
    'Tesla': '테슬라', 'Samsung': '삼성', 'Netflix': '넷플릭스', 'Intel': '인텔',
    'Qualcomm': '퀄컴', 'Alibaba': '알리바바', 'Palantir': '팔란티어',
    'Baidu': '바이두',
    # Companies / products
    'Meta': '메타', 'OpenAI': '오픈AI', 'Anthropic': '앤스로픽',
    'DeepSeek': '딥시크', 'CoreWeave': '코어위브', 'Spotify': '스포티파이',
    'TikTok': '틱톡', 'ByteDance': '바이트댄스', 'Uber': '우버',
    'TSMC': '티에스엠씨', 'AMD': '에이엠디', 'SiFive': '사이파이브',
    'Claude': '클로드', 'iPhone': '아이폰', 'iPad': '아이패드', 'MacBook': '맥북',
    'Vision Pro': '비전 프로', 'Ray-Ban': '레이밴', 'Siri': '시리',
    'Bloomberg': '블룸버그', 'Reuters': '로이터', 'Polymarket': '폴리마켓',
    'Kalshi': '칼시', 'Rapidus': '라피더스', 'Codex': '코덱스',
    'ChatGPT': '챗지피티', 'Gemini': '제미나이', 'Copilot': '코파일럿',
    'WhatsApp': '왓츠앱', 'Instagram': '인스타그램', 'YouTube': '유튜브',
    'Twitter': '트위터', 'Slack': '슬랙', 'Discord': '디스코드',
    'MarketWatch': '마켓워치', 'The Verge': '더 버지', 'TechCrunch': '테크크런치',
    'CNBC': '씨엔비씨', 'Wall Street Journal': '월스트리트저널',
    # People
    'Elon Musk': '일론 머스크', 'Tim Cook': '팀 쿡', 'Sam Altman': '샘 올트먼',
    'Mark Zuckerberg': '마크 저커버그', 'Jensen Huang': '젠슨 황',
    'Satya Nadella': '사티아 나델라', 'Sundar Pichai': '순다르 피차이',
    'Mark Gurman': '마크 거먼', 'Dario Amodei': '다리오 아모데이',
    'Trump': '트럼프', 'Powell': '파월', 'Musk': '머스크',
    # Technical abbreviations
    'AR/VR': '에이알/브이알', 'LLM': '대규모언어모델',
}

# One alternation, longest key first so "Elon Musk" wins over "Musk". The
# lookarounds reject matches embedded in a longer English word ("Intelligence",
# "Metaverse") while still allowing a Korean particle to follow directly.
_KO_PATTERN = re.compile(
    r'(?<![A-Za-z])(?:'
    + '|'.join(re.escape(name) for name in sorted(_KO_MAP, key=len, reverse=True))
    + r')(?![A-Za-z])'
)

# CJK ideographs plus Japanese hiragana/katakana; these are deleted outright
# because they cannot be transliterated mechanically.
_FOREIGN_SCRIPT = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf\u3040-\u309f\u30a0-\u30ff]+')


def _localize_summary(text):
    """Apply the proper-noun map and strip Chinese/Japanese characters line by line."""
    cleaned = []
    for line in text.split('\n'):
        stripped = line.strip()
        # URL, COVERS and separator lines must reach the caller untouched.
        if stripped == '---' or stripped.startswith(('URL:', 'COVERS:', 'http')):
            cleaned.append(line)
            continue
        line = _KO_PATTERN.sub(lambda m: _KO_MAP[m.group(0)], line)
        cleaned.append(_FOREIGN_SCRIPT.sub('', line))
    return '\n'.join(cleaned)


def summarize_batch(article_texts):
    """Summarize one batch of articles in Korean and post-process the result.

    The article texts are appended to a fixed Korean-language prompt and sent
    to the configured model as a single user message. The reply is then
    cleaned in code: known English proper nouns are replaced by Korean
    transliterations and Chinese/Japanese characters are removed, while
    ``URL:``, ``COVERS:``, ``---`` and bare-link lines are left as they are.

    Args:
        article_texts: Strings describing the articles; they are joined with
            newlines at the end of the prompt.

    Returns:
        The post-processed summary text (empty if the model returned no content).
    """

    prompt = f"""[시스템 지시] 반드시 한국어로만 출력해. 중국어(漢字), 일본어, 아랍어, 영어 문장은 절대 포함하지 마. 고유명사만 영어 허용.

아래 영어 뉴스 기사들을 한국어로 상세히 요약해줘.

출력 형식 (각 기사마다 정확히 이 형식):
---
한국어 제목 (한 문장, 이모티콘 없이)
-첫 번째 핵심 사실을 2~3줄 분량으로 상세히 서술
-두 번째 핵심 사실을 2~3줄 분량으로 상세히 서술
-세 번째 핵심 사실을 2~3줄 분량으로 상세히 서술
URL: 원본링크
COVERS: 원본기사번호들 (예: 1,3,7)
---

아래는 좋은 출력 예시야:

---
애플, 2027년 출시 예정 스마트 안경의 4가지 디자인을 테스트 중
-블룸버그 마크 거먼에 따르면 애플은 큰 사각형·슬림 사각형(팀 쿡 착용 스타일)·큰 원형/타원형·작은 원형/타원형 등 4가지 프레임 디자인과 블랙·오션블루·라이트브라운 등 다양한 색상을 테스트 중
-이 안경은 디스플레이가 없으며 사진·영상 촬영, 전화 응답, 음악 재생, 업그레이드된 Siri와의 상호작용 등 Meta Ray-Ban 안경에 가까운 기능을 제공할 예정
-Vision Pro의 부진한 반응 이후 AR/VR 복합기기 전략에서 선회한 것으로, 올해 말 공개 후 2027년 판매 개시가 목표
URL: https://example.com/article
COVERS: 1,5
---

[필수 규칙]
1. 문장은 반드시 한국어로 작성. 단, 고유명사(기업명·제품명·인명·기술명)는 영어 원문 그대로 사용 가능 (예: Apple, OpenAI, Claude, Meta Ray-Ban, Vision Pro, TSMC 등)
2. 일반 영어 단어는 반드시 한국어로 번역 (예: announced→발표함, partnership→파트너십/제휴, according to→~에 따르면)
3. 글머리 기호(-) 문장은 음슴체로 작성 (~중, ~예정, ~목표, ~했음, ~알려짐, ~밝힘 등). 문장 끝에 마침표 절대 찍지 마
4. 각 글머리 기호는 구체적 수치·인명·배경까지 포함해 2~3줄 분량으로 상세히 서술. 짧게 쓰지 마
5. 정렬 (매우 중요, 반드시 지켜). 아래 순서대로 기사를 배치해:
  [1구간] 상장 기업 관련 뉴스를 시가총액 높은 순서에서 낮은 순서로 배치. 시가총액 참고: Apple 3.5조$ > Nvidia 2.8조$ > Microsoft 2.7조$ > Google 2조$ > Amazon 1.9조$ > Meta 1.5조$ > TSMC 8000억$ > Tesla 7000억$ > ...
  [2구간] 비상장 기업 관련 뉴스를 장외 기업가치 높은 순서에서 낮은 순서로 배치. 기업가치 참고: SpaceX 3500억$ > OpenAI 3000억$ > Anthropic 610억$ > CoreWeave 350억$ > ...
  [3구간] 위 기업 뉴스가 모두 끝난 후, 정부·정치·규제·글로벌 사회 뉴스를 마지막에 배치
6. 중복 기사는 하나로 합침 (URL은 가장 상세한 기사 것 사용)
7. 모든 기사 번호가 COVERS에 포함되어야 함. 기사를 빠뜨리지 마
8. 중복이 아닌 기사는 반드시 각각 별도 항목으로 출력
9. 기사 요약 외에 다른 말(인사, 설명, 코멘트, 번역 참고사항 등)은 절대 출력하지 마. 오직 ---로 구분된 기사 요약만 출력해

기사 목록:
{chr(10).join(article_texts)}"""

    response = LLM.chat.completions.create(
        model=LLM_MODEL,
        max_tokens=6000,
        messages=[{"role": "user", "content": prompt}]
    )
    # The content field can be None (e.g. an empty completion); treat as no output.
    text = response.choices[0].message.content or ''

    # Second pass in code: replace/strip non-Korean text without spending tokens.
    return _localize_summary(text)


# A line consisting only of '---' separates article blocks in the model output.
_BLOCK_SEPARATOR = re.compile(r'^[ \t\r]*---[ \t\r]*$', re.MULTILINE)


def _dedupe_blocks(merged):
    """Drop blocks whose title repeats an earlier one; return (text, removed_count).

    Two titles count as duplicates when at least half of the smaller title's
    Korean-word/number tokens also occur in the other. The COVERS numbers of a
    dropped block are appended to the block that is kept, so the articles it
    covered are still reported as covered.
    """
    kept = []  # [block_text, title_keywords] pairs, in output order
    removed = 0
    for chunk in _BLOCK_SEPARATOR.split(merged):
        block = chunk.strip()
        if not block:
            continue
        # Separator lines are already gone, so the first line is the title.
        title = block.split('\n', 1)[0].strip().replace('-', '')
        keywords = set(re.findall(r'[가-힣]+|[0-9]+', title))

        twin = None
        if keywords:
            for entry in kept:
                earlier = entry[1]
                if not earlier:
                    continue
                overlap = len(keywords & earlier) / min(len(keywords), len(earlier))
                if overlap >= 0.5:
                    twin = entry
                    break

        if twin is None:
            kept.append([block, keywords])
            continue

        removed += 1
        cover_lines = re.findall(r'^\s*COVERS:(.*)$', block, flags=re.MULTILINE)
        numbers = re.findall(r'\d+', ' '.join(cover_lines))
        if numbers:
            twin[0] += '\nCOVERS: ' + ','.join(numbers)

    return '\n---\n'.join(text for text, _ in kept), removed


def summarize_with_claude(articles):
    """Summarize all articles in batches of 10 and merge the results.

    Articles are numbered from 1 across the whole list, so the COVERS numbers
    in the output refer to positions in ``articles``. When more than one batch
    was needed, blocks with near-identical titles are merged across batches.

    Args:
        articles: Article dicts providing ``source``, ``title``, ``desc`` and
            ``link``; only the first 200 characters of ``desc`` are sent.

    Returns:
        The combined summary text ('' when ``articles`` is empty).
    """
    BATCH = 10
    all_texts = [
        f"[{num}] SOURCE: {a['source']}\nTITLE: {a['title']}\nDESC: {a['desc'][:200]}\nURL: {a['link']}"
        for num, a in enumerate(articles, 1)
    ]

    results = []
    for start in range(0, len(all_texts), BATCH):
        batch = all_texts[start:start + BATCH]
        print(f"  Batch {start//BATCH + 1}: articles {start+1}-{start+len(batch)}")
        results.append(summarize_batch(batch))
        # Pause only between requests; nothing follows the final batch.
        if start + BATCH < len(all_texts):
            time.sleep(3)
    merged = '\n'.join(results)

    # Each batch is summarized independently, so the same story can appear twice.
    if len(results) > 1:
        print("  Deduplicating across batches...")
        merged, removed = _dedupe_blocks(merged)
        if removed > 0:
            print(f"  Removed {removed} duplicate(s)")

    return merged


def build_telegram_messages(claude_output, articles):
    """Turn the summary text into Telegram-ready HTML messages.

    Only three kinds of lines survive: titles and ``-`` bullets that contain
    Korean text, and link lines. Title and bullet text is HTML-escaped because
    messages are sent with HTML parse mode. Links are replaced by shortened
    URLs; a link is kept only when the line emitted just before it was Korean
    content, so links whose summary was filtered out do not appear on their
    own. Output is packed into chunks of at most 4000 characters, the first
    starting with a dated header.

    Args:
        claude_output: Summary text with ``---``-separated blocks.
        articles: Unused; kept so existing callers keep working.

    Returns:
        A list of message strings. URLs that could not be shortened remain in
        their original form.
    """
    import html  # local import: only this function needs it

    now = datetime.now(KST).strftime('%Y년 %m월 %d일 오전 7시')
    header = f"<b>오늘의 테크 뉴스</b>\n{now}\n{'─'*28}\n"

    short_cache = {}  # long URL -> shortener result (the URL itself on failure)

    def tidy(url):
        """Strip sentence punctuation that the URL regex swallowed."""
        url = url.rstrip('.')
        if '(' not in url:
            # A trailing ')' without any '(' belongs to the surrounding text.
            url = url.rstrip(').')
        return url

    def short_for(url):
        """Shorten ``url`` at most once per call of the enclosing function."""
        if url.startswith('https://buly.kr/'):
            return url
        if url not in short_cache:
            short_cache[url] = shorten_url(url)
            time.sleep(1)
        return short_cache[url]

    def render(text):
        """Escape ``text`` for HTML mode, shortening any URLs embedded in it."""
        pieces = re.split(r'(https?://\S+)', text)
        for idx, piece in enumerate(pieces):
            if idx % 2:  # odd indexes hold the captured URLs
                url = tidy(piece)
                pieces[idx] = short_for(url) + html.escape(piece[len(url):], quote=False)
            else:
                pieces[idx] = html.escape(piece, quote=False)
        return ''.join(pieces)

    messages = []
    current = header
    after_korean = False  # was the most recently emitted line Korean content?

    for raw in claude_output.strip().split('\n'):
        line = raw.strip()
        if not line or line == '---' or line.startswith(('SECTION:', 'COVERS:')):
            continue

        if line.startswith(('URL:', 'http://', 'https://')):
            found = re.search(r'https?://\S+', line)
            # Skip malformed link lines and links with no summary above them;
            # tracking this here keeps it correct across message boundaries.
            if not found or not after_korean:
                continue
            formatted = f"{short_for(tidy(found.group(0)))}\n"
            after_korean = False
        elif re.search(r'[가-힣]', line):
            body = render(line)
            # Titles get a blank line above them; bullets do not.
            formatted = f"{body}\n" if line.startswith('-') else f"\n{body}\n"
            after_korean = True
        else:
            # Lines without any Korean text are dropped.
            continue

        if len(current) + len(formatted) > 4000:
            messages.append(current)
            current = ""
        current += formatted

    if current.strip():
        messages.append(current)

    # Give URLs whose shortening failed one more round of attempts.
    for long_url, short in list(short_cache.items()):
        if short != long_url:
            continue
        retried = shorten_url(long_url)
        time.sleep(1)
        if retried != long_url:
            messages = [m.replace(long_url, retried) for m in messages]

    return messages


def fetch_polymarket():
    """Build an HTML snippet with the current odds of the configured Polymarket events.

    For each entry under ``polymarket`` in the config, the event is looked up
    by ``slug`` and the first outcome price of its first market is shown as a
    percentage together with the event volume. ``label`` overrides the event
    title when present. Errors for one market are printed and skipped.

    Returns:
        The snippet, or '' when nothing is configured or no market could be read.
    """
    import html  # local import: titles are embedded in an HTML-mode message

    markets = CFG.get('polymarket', [])
    if not markets:
        return ""
    lines = [f"\n{'─'*28}", "<b>Polymarket 예측 시장</b>\n"]
    for m in markets:
        slug = m.get('slug')
        if not slug:
            print("  Polymarket error: entry without a slug, skipped")
            continue
        try:
            r = requests.get(f"https://gamma-api.polymarket.com/events?slug={slug}",
                headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
            data = r.json()
            if data:
                event = data[0]
                mkt = event['markets'][0]
                # outcomePrices arrives as a JSON-encoded string, hence the second decode.
                prices = json.loads(mkt['outcomePrices'])
                yes_pct = float(prices[0]) * 100
                vol = float(event.get('volume', 0))
                title = html.escape(str(m.get('label', event['title'])), quote=False)
                lines.append(f"{title}: <b>{yes_pct:.1f}%</b> (거래량 ${vol:,.0f})")
        except Exception as e:
            print(f"  Polymarket error ({slug}): {e}")
    # Only the divider and heading are present: nothing worth sending.
    if len(lines) <= 2:
        return ""
    return "\n".join(lines) + "\n"


def main():
    """Run one digest cycle: fetch, summarize, format, send, and record.

    Steps:
      1. Resolve the chat id; abort if none is known.
      2. Fetch feed entries and keep those whose link is not in the database.
      3. Summarize them and build the Telegram messages.
      4. Append a plain fallback entry for every article number missing from
         the COVERS lines of the summary.
      5. Try once more to shorten any URL that is still long.
      6. Send the messages, retrying each failed send once.
      7. Mark the articles as sent, unless no message could be delivered.

    The database connection is closed on every exit path.
    """
    import html  # local import: fallback entries must be escaped for HTML mode

    print(f"[{datetime.now()}] Starting news digest...")

    chat_id = get_chat_id()
    if not chat_id:
        print("ERROR: No chat_id. Send /start to @SjLab_bot first!")
        return

    conn = init_db()
    try:
        articles = fetch_news()
        new_articles = [a for a in articles if not was_sent(conn, a['link'])]

        print(f"  Total: {len(articles)}, New: {len(new_articles)}")

        if not new_articles:
            print("  No new articles to send.")
            return

        # Summarization in batches
        print("  Summarizing...")
        claude_output = summarize_with_claude(new_articles)
        print(f"  Summary: {len(claude_output)} chars")

        messages = build_telegram_messages(claude_output, new_articles)

        # Validation 1: every article number should appear on some COVERS line.
        covered = set()
        for m in re.findall(r'COVERS:\s*([0-9,\s]+)', claude_output):
            for n in re.findall(r'\d+', m):
                covered.add(int(n))
        all_nums = set(range(1, len(new_articles) + 1))
        missing_nums = all_nums - covered
        if missing_nums:
            print(f"  WARNING: {len(missing_nums)} article(s) not covered: {sorted(missing_nums)}")
            for n in sorted(missing_nums):
                a = new_articles[n - 1]
                short = shorten_url(a['link'])
                time.sleep(1)
                title = html.escape(a['title'], quote=False)
                desc = html.escape(a['desc'][:150], quote=False)
                entry = f"\n{title}\n- {desc}\n{short}\n"
                # Pack whole entries so a message never ends in the middle of a URL.
                if messages and len(messages[-1]) + len(entry) < 4000:
                    messages[-1] += entry
                else:
                    messages.append(entry)

        # Validation 2: verify all links are shortened
        unshortened = []
        for msg in messages:
            for url in re.findall(r'https?://\S+', msg):
                url = url.rstrip(').').rstrip('"').rstrip("'").rstrip('>')
                if not url.startswith('https://buly.kr/'):
                    unshortened.append(url)
        if unshortened:
            print(f"  WARNING: {len(unshortened)} unshortened URL(s), fixing...")
            for orig in set(unshortened):
                short = shorten_url(orig)
                time.sleep(1)
                if short != orig:
                    messages = [m.replace(orig, short) for m in messages]

        print(f"  Covered {len(covered)}/{len(new_articles)} articles, {len(unshortened)} URLs fixed")

        delivered = 0
        for msg in messages:
            result = send_msg(chat_id, msg)
            if result.get('ok'):
                print(f"  Sent message ({len(msg)} chars)")
                delivered += 1
            else:
                # Wait as long as Telegram asks (default 10s), then retry once.
                retry_after = result.get('parameters', {}).get('retry_after', 10)
                if result.get('error_code') == 429:
                    print(f"  Rate limited, waiting {retry_after}s...")
                else:
                    print(f"  Send failed ({result.get('description')}), retrying in {retry_after}s...")
                time.sleep(retry_after + 1)
                result = send_msg(chat_id, msg)
                if result.get('ok'):
                    print(f"  Sent message on retry ({len(msg)} chars)")
                    delivered += 1
                else:
                    print(f"  Send failed: {result}")
            time.sleep(3)

        if messages and delivered == 0:
            # Nothing reached the chat; leave the articles unmarked so a later
            # run can pick them up again instead of losing them silently.
            print("  ERROR: no message was delivered; articles not marked as sent.")
            return

        for a in new_articles:
            mark_sent(conn, a['link'])

        if delivered < len(messages):
            print(f"  Done with errors: {delivered}/{len(messages)} messages delivered.")
        else:
            print(f"  Done! Sent {len(new_articles)} articles.")
    finally:
        conn.close()


# Run one digest cycle only when this file is executed as a script.
if __name__ == '__main__':
    main()