"""
Crawl InternDataEngine online docs and save as local markdown files.

Usage:
    python migrate/crawl_docs.py
    python migrate/crawl_docs.py --output docs_crawled
"""

import argparse
import os
import re
import time
import urllib.request
from html.parser import HTMLParser

BASE_URL = "https://internrobotics.github.io/InternDataEngine-Docs"

# All pages from the sitemap (extracted from VitePress hash map)
# NOTE(review): the English list has /concepts/objects.html where the Chinese
# list has /zh/concepts/tasks.html, and the Chinese list has no policy/api
# pages -- confirm against the live sitemap.
PAGES = [
    # Getting Started
    "/guides/installation.html",
    "/guides/quickstart.html",
    # Core Concepts
    "/concepts/workflows.html",
    "/concepts/skills.html",
    "/concepts/skills/overview.html",
    "/concepts/skills/pick.html",
    "/concepts/skills/place.html",
    "/concepts/skills/articulation.html",
    "/concepts/objects.html",
    "/concepts/cameras.html",
    "/concepts/robots.html",
    "/concepts/controllers.html",
    # Configuration
    "/config/yaml.html",
    "/config/dr.html",
    "/config/assets.html",
    # Customization
    "/custom/assets.html",
    "/custom/robot.html",
    "/custom/controller.html",
    "/custom/skill.html",
    "/custom/task.html",
    # Policy
    "/policy/training.html",
    # API
    "/api/controllers.html",
    "/api/skills.html",
    # Chinese versions
    "/zh/guides/installation.html",
    "/zh/guides/quickstart.html",
    "/zh/concepts/workflows.html",
    "/zh/concepts/skills.html",
    "/zh/concepts/skills/overview.html",
    "/zh/concepts/skills/pick.html",
    "/zh/concepts/skills/place.html",
    "/zh/concepts/skills/articulation.html",
    "/zh/concepts/tasks.html",
    "/zh/concepts/cameras.html",
    "/zh/concepts/robots.html",
    "/zh/concepts/controllers.html",
    "/zh/config/yaml.html",
    "/zh/config/dr.html",
    "/zh/config/assets.html",
    "/zh/custom/assets.html",
    "/zh/custom/robot.html",
    "/zh/custom/controller.html",
    "/zh/custom/skill.html",
    "/zh/custom/task.html",
]

_HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
_LANG_CLASS_RE = re.compile(r"language-(\w+)")
_WHITESPACE_RE = re.compile(r"\s+")
_ZERO_WIDTH_SPACE = "\u200b"


class HTMLToMarkdown(HTMLParser):
    """Simple HTML to Markdown converter for VitePress content.

    Feed HTML with ``feed()`` and read the result with ``get_markdown()``.
    Output starts at the first element whose ``class`` contains ``vp-doc`` or
    ``VPDoc``; everything inside the tags listed in ``skip_tags`` is dropped.
    """

    def __init__(self):
        super().__init__()
        self.output = []  # markdown fragments; every entry is non-empty
        self.current_tag = None  # unused; kept so existing attribute access works
        self.in_content = False
        self.in_code = False  # inside a fenced <pre><code> block
        self.code_lang = ""
        self.skip_tags = {"script", "style", "nav", "header", "footer", "button"}
        self.skip_depth = 0
        self.heading_level = 0
        self.in_li = False
        self.in_pre = False
        self.in_a = False
        self.a_href = ""
        self.a_text = ""
        self._list_stack = []  # one [tag, item_count] entry per open <ul>/<ol>
        self._item_start = -1  # len(output) right after the latest list marker
        self._wrapper_lang = ""  # language-* class seen on an enclosing element
        self._row_cells = 0
        self._row_is_header = False
        self._table_has_separator = False

    def _emit(self, fragment):
        """Write an inline fragment to the open link label, else to the output."""
        if self.in_a:
            self.a_text += fragment
        else:
            self.output.append(fragment)

    def _append_text(self, data):
        """Append prose text, collapsing whitespace runs to single spaces."""
        text = _WHITESPACE_RE.sub(" ", data)
        # Avoid doubled spaces and leading spaces at the start of a line.
        if not self.output or self.output[-1][-1] in " \n":
            text = text.lstrip(" ")
        if text:
            self.output.append(text)

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        # HTMLParser reports valueless attributes (e.g. <div class>) as None.
        classes = attrs_dict.get("class") or ""

        # Skip navigation, header, footer
        if tag in self.skip_tags:
            self.skip_depth += 1
            return
        if self.skip_depth > 0:
            return

        # Track content area
        if "vp-doc" in classes or "VPDoc" in classes:
            self.in_content = True
        if not self.in_content:
            return

        lang_match = _LANG_CLASS_RE.search(classes)
        if lang_match and tag != "code":
            # Remember a language class from a wrapper so a following
            # <pre><code> without its own class can still be labelled.
            self._wrapper_lang = lang_match.group(1)

        if tag in _HEADING_TAGS:
            self.heading_level = int(tag[1])
            self.output.append("\n" + "#" * self.heading_level + " ")
        elif tag == "p":
            # A paragraph opening a list item stays on the marker's line.
            if len(self.output) != self._item_start:
                self.output.append("\n\n")
        elif tag == "pre":
            self.in_pre = True
        elif tag == "code":
            if self.in_pre:
                self.in_code = True
                self.code_lang = (
                    lang_match.group(1) if lang_match else self._wrapper_lang
                )
                self._wrapper_lang = ""
                self.output.append(f"\n```{self.code_lang}\n")
            else:
                self._emit("`")
        elif tag == "a":
            self.in_a = True
            self.a_href = attrs_dict.get("href") or ""
            self.a_text = ""
        elif tag in ("ul", "ol"):
            self._list_stack.append([tag, 0])
            self.output.append("\n")
        elif tag == "li":
            self.in_li = True
            indent = "  " * max(len(self._list_stack) - 1, 0)
            if self._list_stack and self._list_stack[-1][0] == "ol":
                self._list_stack[-1][1] += 1
                marker = f"{self._list_stack[-1][1]}. "
            else:
                marker = "- "
            self.output.append(indent + marker)
            self._item_start = len(self.output)
        elif tag == "strong" or tag == "b":
            self._emit("**")
        elif tag == "em" or tag == "i":
            self._emit("*")
        elif tag == "br":
            self.output.append("\n")
        elif tag == "img":
            alt = attrs_dict.get("alt") or ""
            src = attrs_dict.get("src") or ""
            self._emit(f"![{alt}]({src})")
        elif tag == "table":
            self._table_has_separator = False
            self.output.append("\n")
        elif tag == "tr":
            self._row_cells = 0
            self._row_is_header = False
            self.output.append("| ")
        elif tag == "th":
            self._row_is_header = True
        elif tag == "td":
            pass
        elif tag == "blockquote":
            self.output.append("\n> ")

    def handle_endtag(self, tag):
        if tag in self.skip_tags:
            # Clamp so a stray closing tag cannot disable later skipping.
            self.skip_depth = max(0, self.skip_depth - 1)
            return
        if self.skip_depth > 0:
            return
        if not self.in_content:
            return

        if tag in _HEADING_TAGS:
            self.output.append("\n")
            self.heading_level = 0
        elif tag == "pre":
            self.in_pre = False
        elif tag == "code":
            if self.in_code:
                self.in_code = False
                self.output.append("\n```\n")
            else:
                self._emit("`")
        elif tag == "a":
            self.in_a = False
            # Drop zero-width spaces so anchors holding only that character
            # produce no link at all.
            label = " ".join(self.a_text.replace(_ZERO_WIDTH_SPACE, "").split())
            if label:
                if self.a_href:
                    self.output.append(f"[{label}]({self.a_href})")
                else:
                    # No target: keep the visible text instead of losing it.
                    self.output.append(label)
            self.a_text = ""
        elif tag in ("ul", "ol"):
            if self._list_stack:
                self._list_stack.pop()
        elif tag == "li":
            self.in_li = False
            if not self.output or not self.output[-1].endswith("\n"):
                self.output.append("\n")
        elif tag == "strong" or tag == "b":
            self._emit("**")
        elif tag == "em" or tag == "i":
            self._emit("*")
        elif tag == "tr":
            self.output.append("\n")
            if (
                self._row_is_header
                and not self._table_has_separator
                and self._row_cells > 0
            ):
                # Markdown tables need a delimiter row after the header row.
                cells = " | ".join(["---"] * self._row_cells)
                self.output.append(f"| {cells} |\n")
                self._table_has_separator = True
        elif tag == "th" or tag == "td":
            self.output.append(" | ")
            self._row_cells += 1
        elif tag == "p":
            self.output.append("\n")

    def handle_data(self, data):
        if self.skip_depth > 0 or not data:
            return
        if self.in_code:
            # Fenced code is copied verbatim, whitespace included.
            self.output.append(data)
            return
        if self.in_a:
            self.a_text += data
            return
        if self.in_content:
            self._append_text(data)

    def get_markdown(self):
        """Return the collected markdown with blank-line runs and trailing spaces trimmed."""
        text = "".join(self.output)
        # Clean up
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r"[ \t]+\n", "\n", text)
        return text.strip()


def fetch_page(url):
    """Fetch a page and return its decoded text, or None on any failure.

    The body is decoded with the charset from the Content-Type header,
    falling back to UTF-8; undecodable bytes are replaced rather than
    failing the whole page. Errors are printed, not raised.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read()
            charset = resp.headers.get_content_charset() or "utf-8"
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The server announced a codec Python does not know.
            return raw.decode("utf-8", errors="replace")
    except Exception as e:  # deliberate best-effort boundary: report and move on
        print(f" ERROR: {e}")
        return None


def html_to_markdown(html_content):
    """Convert HTML to markdown."""
    parser = HTMLToMarkdown()
    parser.feed(html_content)
    # close() flushes any text the parser is still buffering.
    parser.close()
    return parser.get_markdown()
{args.output}/\n") for page_path in pages: url = BASE_URL + page_path # Convert path to filename: /guides/installation.html -> guides_installation.md md_name = page_path.strip("/").replace("/", "_").replace(".html", ".md") md_path = os.path.join(args.output, md_name) print(f" {page_path} -> {md_name}", end=" ") html = fetch_page(url) if html is None: print("FAILED") continue md = html_to_markdown(html) if not md or len(md) < 50: # VitePress SPA - content is loaded via JS, try fetching the raw .md source # VitePress stores source at /InternDataEngine-Docs/page.html but the actual # markdown might be accessible differently print(f"(sparse content: {len(md)} chars, SPA rendering)") else: print(f"OK ({len(md)} chars)") with open(md_path, "w") as f: f.write(f"# Source: {url}\n\n") f.write(md) time.sleep(0.3) # Also try fetching raw markdown from GitHub print("\n\nAttempting raw markdown from GitHub...") gh_base = "https://raw.githubusercontent.com/InternRobotics/InternDataEngine/master/docs" # VitePress source files raw_pages = { "guides/installation.md": "guides_installation_raw.md", "guides/quickstart.md": "guides_quickstart_raw.md", "concepts/workflows.md": "concepts_workflows_raw.md", "concepts/objects.md": "concepts_objects_raw.md", "concepts/cameras.md": "concepts_cameras_raw.md", "concepts/robots.md": "concepts_robots_raw.md", "concepts/controllers.md": "concepts_controllers_raw.md", "concepts/skills/overview.md": "concepts_skills_overview_raw.md", "concepts/skills/pick.md": "concepts_skills_pick_raw.md", "concepts/skills/place.md": "concepts_skills_place_raw.md", "concepts/skills/articulation.md": "concepts_skills_articulation_raw.md", "config/yaml.md": "config_yaml_raw.md", "config/dr.md": "config_dr_raw.md", "config/assets.md": "config_assets_raw.md", "custom/assets.md": "custom_assets_raw.md", "custom/robot.md": "custom_robot_raw.md", "custom/controller.md": "custom_controller_raw.md", "custom/skill.md": "custom_skill_raw.md", "custom/task.md": 
"custom_task_raw.md", "policy/training.md": "policy_training_raw.md", "api/controllers.md": "api_controllers_raw.md", "api/skills.md": "api_skills_raw.md", } for src, dst in raw_pages.items(): url = f"{gh_base}/{src}" dst_path = os.path.join(args.output, dst) print(f" {src}", end=" ") content = fetch_page(url) if content and len(content) > 50 and not content.strip().startswith("