Files
issacdataengine/migrate/crawl_docs.py
Tangger 3d6b73753a feat: add test tube pick task with custom assets and grasp annotations
- Add pick_test_tube task: USDC asset repackaging, grasp generation, task config
- Add tools: usdc_to_obj.py, repackage_test_tube.py, fix_test_tube_materials.py
- Add custom_task_guide.md: full Chinese documentation for creating custom tasks
- Add crawled InternDataEngine online docs (23 pages)
- Add grasp generation script (gen_tube_grasp.py) and pipeline config
2026-04-05 11:01:59 +08:00

329 lines
11 KiB
Python

"""
Crawl InternDataEngine online docs and save as local markdown files.
Usage:
python migrate/crawl_docs.py
python migrate/crawl_docs.py --output docs_crawled
"""
import argparse
import os
import re
import time
import urllib.request
from html.parser import HTMLParser
# Root of the published VitePress site; the page paths below are appended to it.
BASE_URL = "https://internrobotics.github.io/InternDataEngine-Docs"
# All pages from the sitemap (extracted from VitePress hash map)
# NOTE(review): the EN and ZH lists are not symmetric — EN has
# /concepts/objects.html and /concepts/skills.html but no /concepts/tasks.html,
# while ZH has /zh/concepts/tasks.html but no objects page. Presumably this
# mirrors the actual sitemap; verify against the live docs if pages 404.
PAGES = [
    # Getting Started
    "/guides/installation.html",
    "/guides/quickstart.html",
    # Core Concepts
    "/concepts/workflows.html",
    "/concepts/skills.html",
    "/concepts/skills/overview.html",
    "/concepts/skills/pick.html",
    "/concepts/skills/place.html",
    "/concepts/skills/articulation.html",
    "/concepts/objects.html",
    "/concepts/cameras.html",
    "/concepts/robots.html",
    "/concepts/controllers.html",
    # Configuration
    "/config/yaml.html",
    "/config/dr.html",
    "/config/assets.html",
    # Customization
    "/custom/assets.html",
    "/custom/robot.html",
    "/custom/controller.html",
    "/custom/skill.html",
    "/custom/task.html",
    # Policy
    "/policy/training.html",
    # API
    "/api/controllers.html",
    "/api/skills.html",
    # Chinese versions
    "/zh/guides/installation.html",
    "/zh/guides/quickstart.html",
    "/zh/concepts/workflows.html",
    "/zh/concepts/skills.html",
    "/zh/concepts/skills/overview.html",
    "/zh/concepts/skills/pick.html",
    "/zh/concepts/skills/place.html",
    "/zh/concepts/skills/articulation.html",
    "/zh/concepts/tasks.html",
    "/zh/concepts/cameras.html",
    "/zh/concepts/robots.html",
    "/zh/concepts/controllers.html",
    "/zh/config/yaml.html",
    "/zh/config/dr.html",
    "/zh/config/assets.html",
    "/zh/custom/assets.html",
    "/zh/custom/robot.html",
    "/zh/custom/controller.html",
    "/zh/custom/skill.html",
    "/zh/custom/task.html",
]
class HTMLToMarkdown(HTMLParser):
    """Simple HTML to Markdown converter for VitePress content.

    Only text inside the VitePress article container (an element whose class
    attribute contains "vp-doc" or "VPDoc") is converted; everything inside
    script/style/nav/header/footer/button subtrees is dropped.
    """

    # Matches class="language-xxx" on <pre><code> blocks; compiled once.
    _LANG_RE = re.compile(r"language-(\w+)")

    def __init__(self):
        super().__init__()
        self.output = []          # markdown fragments, joined by get_markdown()
        self.in_content = False   # set once the vp-doc container is entered
        self.in_code = False      # inside a fenced code block (<pre><code>)
        self.code_lang = ""       # language tag for the current fence
        # Elements whose entire subtree is excluded from the output.
        self.skip_tags = {"script", "style", "nav", "header", "footer", "button"}
        self.skip_depth = 0       # nesting depth of currently-open skip tags
        self.heading_level = 0
        self.in_li = False
        self.in_pre = False
        self.in_a = False         # buffering link text until </a>
        self.a_href = ""
        self.a_text = ""

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        classes = attrs_dict.get("class", "")
        # Enter (or nest deeper into) a skipped subtree.
        if tag in self.skip_tags:
            self.skip_depth += 1
            return
        if self.skip_depth > 0:
            return
        # Track content area: VitePress marks the article body with vp-doc/VPDoc.
        if "vp-doc" in classes or "VPDoc" in classes:
            self.in_content = True
        if not self.in_content:
            return
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            self.heading_level = int(tag[1])
            self.output.append("\n" + "#" * self.heading_level + " ")
        elif tag == "p":
            self.output.append("\n\n")
        elif tag == "pre":
            self.in_pre = True
        elif tag == "code":
            if self.in_pre:
                # Fenced block; pull the language off class="language-xxx".
                self.in_code = True
                lang_match = self._LANG_RE.search(attrs_dict.get("class", ""))
                self.code_lang = lang_match.group(1) if lang_match else ""
                self.output.append(f"\n```{self.code_lang}\n")
            else:
                # Inline code span.
                self.output.append("`")
        elif tag == "a":
            # Buffer link text so [text](href) can be emitted whole on </a>.
            self.in_a = True
            self.a_href = attrs_dict.get("href", "")
            self.a_text = ""
        elif tag in ("ul", "ol"):
            self.output.append("\n")
        elif tag == "li":
            # NOTE: ordered lists are rendered as bullets as well.
            self.in_li = True
            self.output.append("- ")
        elif tag in ("strong", "b"):
            self.output.append("**")
        elif tag in ("em", "i"):
            self.output.append("*")
        elif tag == "br":
            self.output.append("\n")
        elif tag == "img":
            alt = attrs_dict.get("alt", "")
            src = attrs_dict.get("src", "")
            self.output.append(f"![{alt}]({src})")
        elif tag == "table":
            self.output.append("\n")
        elif tag == "tr":
            self.output.append("| ")
        elif tag == "blockquote":
            self.output.append("\n> ")

    def handle_endtag(self, tag):
        if tag in self.skip_tags:
            # Clamp at zero: a stray close tag in malformed HTML must not push
            # the depth negative, or later skip regions would leak into output.
            self.skip_depth = max(0, self.skip_depth - 1)
            return
        if self.skip_depth > 0:
            return
        if not self.in_content:
            return
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            self.output.append("\n")
            self.heading_level = 0
        elif tag == "pre":
            self.in_pre = False
        elif tag == "code":
            if self.in_code:
                self.in_code = False
                self.output.append("\n```\n")
            else:
                self.output.append("`")
        elif tag == "a":
            self.in_a = False
            # Links with no href or no text are dropped silently.
            if self.a_href and self.a_text:
                self.output.append(f"[{self.a_text.strip()}]({self.a_href})")
            self.a_text = ""
        elif tag == "li":
            self.in_li = False
            self.output.append("\n")
        elif tag in ("strong", "b"):
            self.output.append("**")
        elif tag in ("em", "i"):
            self.output.append("*")
        elif tag == "tr":
            self.output.append("\n")
        elif tag in ("th", "td"):
            self.output.append(" | ")
        elif tag == "p":
            self.output.append("\n")

    def handle_data(self, data):
        if self.skip_depth > 0:
            return
        if self.in_a:
            # Accumulate link text; it is flushed by handle_endtag("a").
            self.a_text += data
            return
        if self.in_content:
            if self.in_code:
                # Preserve code verbatim, including whitespace.
                self.output.append(data)
            else:
                text = data.strip()
                if text:
                    self.output.append(text + " ")

    def get_markdown(self):
        """Return the accumulated output as cleaned-up markdown text."""
        text = "".join(self.output)
        # Collapse runs of blank lines and strip trailing spaces per line.
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r"[ \t]+\n", "\n", text)
        return text.strip()
def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8, or None on failure.

    Any error (network, HTTP, decoding) is reported to stdout and swallowed so
    a single bad page does not abort the whole crawl.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(request, timeout=15) as response:
            body = response.read()
        return body.decode("utf-8")
    except Exception as exc:
        print(f" ERROR: {exc}")
        return None
def html_to_markdown(html_content):
    """Convert an HTML document string to markdown via HTMLToMarkdown."""
    converter = HTMLToMarkdown()
    converter.feed(html_content)
    return converter.get_markdown()
def main():
    """CLI entry point: crawl the docs site and save pages as local markdown.

    Two passes: (1) fetch each rendered HTML page and convert it to markdown
    (VitePress is a SPA, so rendered pages may be nearly empty); (2) attempt to
    fetch the raw .md sources straight from the GitHub repository as a fallback.
    """
    parser = argparse.ArgumentParser(description="Crawl InternDataEngine docs")
    parser.add_argument("--output", default="docs_crawled", help="Output directory")
    parser.add_argument("--zh-only", action="store_true", help="Only crawl Chinese docs")
    parser.add_argument("--en-only", action="store_true", help="Only crawl English docs")
    args = parser.parse_args()
    os.makedirs(args.output, exist_ok=True)

    pages = PAGES
    # If both flags are given, --zh-only takes precedence.
    if args.zh_only:
        pages = [p for p in PAGES if p.startswith("/zh/")]
    elif args.en_only:
        pages = [p for p in PAGES if not p.startswith("/zh/")]

    print(f"Crawling {len(pages)} pages to {args.output}/\n")
    for page_path in pages:
        url = BASE_URL + page_path
        # Convert path to filename: /guides/installation.html -> guides_installation.md
        md_name = page_path.strip("/").replace("/", "_").replace(".html", ".md")
        md_path = os.path.join(args.output, md_name)
        print(f" {page_path} -> {md_name}", end=" ")
        html = fetch_page(url)
        if html is None:
            print("FAILED")
            continue
        md = html_to_markdown(html)
        if not md or len(md) < 50:
            # VitePress SPA - content is loaded via JS, try fetching the raw .md source
            # VitePress stores source at /InternDataEngine-Docs/page.html but the actual
            # markdown might be accessible differently
            print(f"(sparse content: {len(md)} chars, SPA rendering)")
        else:
            print(f"OK ({len(md)} chars)")
        # Write even sparse pages so the source URL is recorded for each page.
        # encoding is pinned to UTF-8: the ZH docs would break under a
        # platform-default legacy encoding (e.g. on Windows).
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Source: {url}\n\n")
            f.write(md)
        time.sleep(0.3)  # be polite to the docs server

    # Also try fetching raw markdown from GitHub
    print("\n\nAttempting raw markdown from GitHub...")
    gh_base = "https://raw.githubusercontent.com/InternRobotics/InternDataEngine/master/docs"
    # VitePress source files: repo path -> local output filename.
    raw_pages = {
        "guides/installation.md": "guides_installation_raw.md",
        "guides/quickstart.md": "guides_quickstart_raw.md",
        "concepts/workflows.md": "concepts_workflows_raw.md",
        "concepts/objects.md": "concepts_objects_raw.md",
        "concepts/cameras.md": "concepts_cameras_raw.md",
        "concepts/robots.md": "concepts_robots_raw.md",
        "concepts/controllers.md": "concepts_controllers_raw.md",
        "concepts/skills/overview.md": "concepts_skills_overview_raw.md",
        "concepts/skills/pick.md": "concepts_skills_pick_raw.md",
        "concepts/skills/place.md": "concepts_skills_place_raw.md",
        "concepts/skills/articulation.md": "concepts_skills_articulation_raw.md",
        "config/yaml.md": "config_yaml_raw.md",
        "config/dr.md": "config_dr_raw.md",
        "config/assets.md": "config_assets_raw.md",
        "custom/assets.md": "custom_assets_raw.md",
        "custom/robot.md": "custom_robot_raw.md",
        "custom/controller.md": "custom_controller_raw.md",
        "custom/skill.md": "custom_skill_raw.md",
        "custom/task.md": "custom_task_raw.md",
        "policy/training.md": "policy_training_raw.md",
        "api/controllers.md": "api_controllers_raw.md",
        "api/skills.md": "api_skills_raw.md",
    }
    for src, dst in raw_pages.items():
        url = f"{gh_base}/{src}"
        dst_path = os.path.join(args.output, dst)
        print(f" {src}", end=" ")
        content = fetch_page(url)
        # A 404 from raw.githubusercontent.com can come back as an HTML error
        # page, so reject anything that looks like a document rather than md.
        if content and len(content) > 50 and not content.strip().startswith("<!DOCTYPE"):
            with open(dst_path, "w", encoding="utf-8") as f:
                f.write(content)
            print(f"OK ({len(content)} chars)")
        else:
            print("NOT FOUND or HTML")
        time.sleep(0.3)

    print(f"\nDone. Files saved to {args.output}/")
# Script entry point: run the crawler when executed directly.
if __name__ == "__main__":
    main()