#!/usr/bin/env python3
"""Crawl every post of the Naver blog rob_ust and save each one as a text file."""

import json, time, re, os, sys
from urllib.request import Request, urlopen
from urllib.parse import unquote_plus
from html.parser import HTMLParser

# Naver blog account whose posts are crawled; interpolated into every URL below.
BLOG_ID = "rob_ust"
# Directory that receives the post index (_index.json) and one .txt file per post.
OUT_DIR = "/home/admin/apps/apt_db/data/blog_posts"
# Headers attached to every request: a desktop-Chrome User-Agent and a Referer
# pointing at the blog itself.
# NOTE(review): presumably needed so Naver serves the same markup it serves to
# a browser — not verifiable from this file.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": f"https://blog.naver.com/{BLOG_ID}",
}

# Import-time side effect: make sure the output directory exists before any
# function tries to write into it.
os.makedirs(OUT_DIR, exist_ok=True)


class TextExtractor(HTMLParser):
    """Minimal HTML-to-text converter.

    Feed markup with ``feed()`` (and ``close()`` to flush buffered input),
    then read the result with ``get_text()``. Text inside ``<script>``,
    ``<style>`` and ``<noscript>`` is dropped; every ``<br>``, ``<p>`` and
    ``<div>`` start tag contributes one newline.
    """

    # Elements whose text content must never reach the output.
    _SKIP_TAGS = frozenset(('script', 'style', 'noscript'))
    # Elements whose start tag is rendered as a line break.
    _BREAK_TAGS = frozenset(('br', 'p', 'div'))

    def __init__(self):
        super().__init__()
        self.texts = []
        # Nesting depth of currently open skipped elements. A counter (rather
        # than a boolean) keeps skipping active after an inner element closes,
        # e.g. for <noscript><style>...</style>still hidden</noscript>.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        """Track skipped elements and emit a newline for block-level tags."""
        if tag in self._SKIP_TAGS:
            self._skip_depth += 1
        if tag in self._BREAK_TAGS:
            self.texts.append('\n')

    def handle_endtag(self, tag):
        """Leave one level of skipping; stray end tags are ignored."""
        if tag in self._SKIP_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data):
        """Collect text that is not inside a skipped element."""
        if self._skip_depth == 0:
            self.texts.append(data)

    def get_text(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.texts)


def fetch(url):
    """Download *url* and return the response body as text.

    The request carries the module-level ``HEADERS`` and uses a 15 second
    timeout. The body is decoded as UTF-8; undecodable bytes are replaced
    rather than raising. Errors raised by ``urlopen`` are not caught here.
    """
    request = Request(url, headers=HEADERS)
    with urlopen(request, timeout=15) as response:
        body = response.read()
    return body.decode('utf-8', errors='replace')


# A backslash plus, optionally, the character that makes it a valid JSON
# escape. Consuming valid escapes as a pair means an escaped backslash is
# never re-examined on its own and mistaken for the start of a bad escape.
_JSON_ESCAPE_RE = re.compile(r'\\(["\\/bfnrtu])?')


def _repair_json_escapes(raw):
    """Double every backslash in *raw* that does not start a valid JSON escape."""
    return _JSON_ESCAPE_RE.sub(
        lambda m: m.group(0) if m.group(1) else '\\\\',
        raw,
    )


def get_all_posts():
    """Collect metadata for every post of the blog via the title-list endpoint.

    Pages of 30 titles are requested one after another until the endpoint
    returns an empty page, a page that contains no post not seen before, or
    the number of collected posts reaches the reported ``totalCount``.

    Returns:
        A list of dicts with the keys ``logNo``, ``title`` (URL-decoded),
        ``date`` and ``categoryNo``, one per distinct ``logNo``, in the order
        the endpoint returned them.

    Raises:
        Whatever ``fetch`` raises on network failure, ``json.JSONDecodeError``
        if a response is still not valid JSON after escape repair, and
        ``KeyError`` if a post entry lacks one of the expected fields.
    """
    all_posts = []
    seen_log_nos = set()
    page = 1
    while True:
        url = f"https://blog.naver.com/PostTitleListAsync.naver?blogId={BLOG_ID}&currentPage={page}&categoryNo=0&countPerPage=30"
        # Naver returns non-standard JSON with invalid escapes; repair first.
        data = json.loads(_repair_json_escapes(fetch(url)))
        posts = data.get("postList", [])
        if not posts:
            break

        added = 0
        for p in posts:
            log_no = p["logNo"]
            # A page that repeats earlier entries must not create duplicates.
            if log_no in seen_log_nos:
                continue
            seen_log_nos.add(log_no)
            all_posts.append({
                "logNo": log_no,
                # unquote_plus already turns '+' into a space.
                "title": unquote_plus(p["title"]),
                "date": p["addDate"],
                "categoryNo": p["categoryNo"],
            })
            added += 1

        total = int(data.get("totalCount", 0))
        print(f"  Page {page}: {len(posts)} posts (total so far: {len(all_posts)}/{total})")
        # Stop when the page brought nothing new (guards against an endless
        # loop on repeated pages) or when the reported total is reached. A
        # missing/zero total is treated as "unknown", not as "done".
        if added == 0 or (total > 0 and len(all_posts) >= total):
            break
        page += 1
        time.sleep(0.3)  # polite delay between list requests
    return all_posts


# An element whose class attribute starts with "se-text-paragraph", captured
# up to the closing tag of that same element, so a paragraph made of several
# inline children (spans, links, bold runs) is captured in full.
_SE_PARAGRAPH_RE = re.compile(
    r'<(\w+)\b[^>]*?\bclass="se-text-paragraph[^"]*"[^>]*>(.*?)</\1\s*>',
    re.S,
)


def _extract_post_text(page_html):
    """Turn the HTML of a post page into plain text."""
    import html as html_mod

    paragraphs = [m.group(2) for m in _SE_PARAGRAPH_RE.finditer(page_html)]
    if paragraphs:
        # New editor: one output line per non-empty paragraph.
        stripped = (re.sub(r'<[^>]+>', '', p).strip() for p in paragraphs)
        text = '\n'.join(line for line in stripped if line)
        # Tags were removed by regex, so entities are still encoded.
        needs_unescape = True
    else:
        # Fallback: old editor - extract from se_textarea or post-view.
        m = (
            re.search(r'class="se_textarea">(.*?)</div>', page_html, re.S)
            or re.search(r'id="post-view\d*"[^>]*>(.*?)</div>\s*<!--', page_html, re.S)
            or re.search(r'<div class="se-main-container">(.*)', page_html, re.S)
        )
        content_html = m.group(1) if m else ""
        extractor = TextExtractor()
        extractor.feed(content_html)
        # Flush input the parser is still buffering (e.g. trailing text).
        extractor.close()
        text = extractor.get_text()
        # HTMLParser already decoded character references; decoding again
        # would corrupt text such as a literal "&amp;lt;".
        needs_unescape = False

    # Clean up: at most one blank line in a row, single spaces.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    if needs_unescape:
        text = html_mod.unescape(text)
    return text.strip()


def get_post_content(log_no):
    """Fetch a single blog post and return its body as plain text.

    Args:
        log_no: The post identifier used as ``logNo`` in the PostView URL.

    Returns:
        The post text with tags removed, entities decoded once, runs of
        spaces/tabs collapsed and surrounding whitespace stripped. An empty
        string when no known content container is found in the page.

    Raises:
        Whatever ``fetch`` raises when the page cannot be downloaded.
    """
    url = f"https://blog.naver.com/PostView.naver?blogId={BLOG_ID}&logNo={log_no}&redirect=Dlog&widgetTypeCall=true&directAccess=false"
    return _extract_post_text(fetch(url))


def main():
    """Crawl the whole blog: write the post index, then one text file per post.

    A post whose output file already exists and is larger than 100 bytes is
    treated as cached and skipped, so a run can be repeated or resumed. An
    error on a single post is printed and the loop continues with the next.
    """
    print("=== 네이버 블로그 rob_ust 크롤링 시작 ===\n")

    # Phase 1: collect the list of posts and persist it as the index file.
    print("[1/2] 글 목록 수집 중...")
    posts = get_all_posts()
    print(f"\n총 {len(posts)}개 글 발견\n")

    index_path = os.path.join(OUT_DIR, "_index.json")
    with open(index_path, "w", encoding="utf-8") as index_file:
        json.dump(posts, index_file, ensure_ascii=False, indent=2)

    # Phase 2: download the body of every post that is not cached yet.
    print("[2/2] 각 글 본문 수집 중...")
    total = len(posts)
    for position, post in enumerate(posts, start=1):
        outfile = os.path.join(OUT_DIR, f"{post['logNo']}.txt")
        already_cached = os.path.exists(outfile) and os.path.getsize(outfile) > 100
        if already_cached:
            print(f"  [{position}/{total}] SKIP (cached): {post['title'][:50]}")
            continue
        try:
            content = get_post_content(post["logNo"])
            with open(outfile, "w", encoding="utf-8") as out:
                # Small header (title, date, separator) followed by the body.
                out.writelines((
                    f"제목: {post['title']}\n",
                    f"날짜: {post['date']}\n",
                    "---\n\n",
                    content,
                ))
            print(f"  [{position}/{total}] OK ({len(content):,} chars): {post['title'][:50]}")
        except Exception as exc:
            # Best effort: report the failure and keep crawling.
            print(f"  [{position}/{total}] ERROR: {post['title'][:50]} - {exc}")
        time.sleep(0.5)  # polite delay

    print(f"\n=== 완료! {OUT_DIR} 에 저장됨 ===")


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
