- Add pick_test_tube task: USDC asset repackaging, grasp generation, task config - Add tools: usdc_to_obj.py, repackage_test_tube.py, fix_test_tube_materials.py - Add custom_task_guide.md: full Chinese documentation for creating custom tasks - Add crawled InternDataEngine online docs (23 pages) - Add grasp generation script (gen_tube_grasp.py) and pipeline config
329 lines
11 KiB
Python
329 lines
11 KiB
Python
"""
|
|
Crawl InternDataEngine online docs and save as local markdown files.
|
|
|
|
Usage:
|
|
python migrate/crawl_docs.py
|
|
python migrate/crawl_docs.py --output docs_crawled
|
|
"""
|
|
import argparse
|
|
import os
|
|
import re
|
|
import time
|
|
import urllib.request
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
# Root of the published documentation site; page paths below are appended to it.
BASE_URL = "https://internrobotics.github.io/InternDataEngine-Docs"

# English pages from the sitemap (extracted from the VitePress hash map).
_EN_PAGES = [
    # Getting Started
    "/guides/installation.html",
    "/guides/quickstart.html",
    # Core Concepts
    "/concepts/workflows.html",
    "/concepts/skills.html",
    "/concepts/skills/overview.html",
    "/concepts/skills/pick.html",
    "/concepts/skills/place.html",
    "/concepts/skills/articulation.html",
    "/concepts/objects.html",
    "/concepts/cameras.html",
    "/concepts/robots.html",
    "/concepts/controllers.html",
    # Configuration
    "/config/yaml.html",
    "/config/dr.html",
    "/config/assets.html",
    # Customization
    "/custom/assets.html",
    "/custom/robot.html",
    "/custom/controller.html",
    "/custom/skill.html",
    "/custom/task.html",
    # Policy
    "/policy/training.html",
    # API
    "/api/controllers.html",
    "/api/skills.html",
]

# Chinese versions, served under the /zh/ prefix.
_ZH_PAGES = [
    "/zh/guides/installation.html",
    "/zh/guides/quickstart.html",
    "/zh/concepts/workflows.html",
    "/zh/concepts/skills.html",
    "/zh/concepts/skills/overview.html",
    "/zh/concepts/skills/pick.html",
    "/zh/concepts/skills/place.html",
    "/zh/concepts/skills/articulation.html",
    "/zh/concepts/tasks.html",
    "/zh/concepts/cameras.html",
    "/zh/concepts/robots.html",
    "/zh/concepts/controllers.html",
    "/zh/config/yaml.html",
    "/zh/config/dr.html",
    "/zh/config/assets.html",
    "/zh/custom/assets.html",
    "/zh/custom/robot.html",
    "/zh/custom/controller.html",
    "/zh/custom/skill.html",
    "/zh/custom/task.html",
]

# All pages to crawl: English first, then Chinese, in sitemap order.
PAGES = _EN_PAGES + _ZH_PAGES
|
|
|
|
|
|
class HTMLToMarkdown(HTMLParser):
    """Simple HTML to Markdown converter for VitePress content.

    Feed HTML with ``feed()`` and read the result with ``get_markdown()``.
    Only markup that appears at or after the first element whose ``class``
    contains ``vp-doc`` or ``VPDoc`` is converted; everything inside the tags
    listed in ``skip_tags`` is ignored.

    Supported constructs: headings, paragraphs, fenced and inline code,
    links, images, ordered/unordered (nested) lists, bold/italic, line
    breaks, tables (with a header separator row) and blockquotes.
    """

    _HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
    # Indentation added per nesting level so that nested items stay attached
    # to their parent under both "- " and "1. " markers.
    _LIST_INDENT = "    "

    def __init__(self):
        super().__init__()
        self.output = []            # Markdown fragments, joined in get_markdown()
        self.current_tag = None     # kept for backward compatibility (unused)
        self.in_content = False     # True once the documentation container was seen
        self.in_code = False        # inside <pre><code> (fenced block)
        self.code_lang = ""         # language of the current/last fenced block
        self.skip_tags = {"script", "style", "nav", "header", "footer", "button"}
        self.skip_depth = 0         # nesting depth inside skipped tags
        self.heading_level = 0
        self.in_li = False
        self.in_pre = False
        self.in_a = False           # inside <a>; text is buffered in a_text
        self.a_href = ""
        self.a_text = ""
        self._list_stack = []       # one entry per open list: int counter for <ol>, None for <ul>
        self._row_cells = 0         # cells seen in the current table row
        self._row_is_header = False  # current row contains at least one <th>
        self._table_has_separator = False

    def _emit(self, text):
        """Write an inline fragment to the link label if inside <a>, else to the output."""
        if self.in_a:
            self.a_text += text
        else:
            self.output.append(text)

    def _after_whitespace(self):
        """Return True if nothing was written yet or the output ends with whitespace."""
        return not self.output or self.output[-1][-1:].isspace()

    def _close_link(self):
        """Flush the buffered link as ``[label](href)``, or as plain text without href."""
        self.in_a = False
        # Zero-width spaces are dropped so that anchors carrying no visible text
        # (presumably VitePress heading permalinks - confirm) produce no output.
        label = re.sub(r"\s+", " ", self.a_text.replace("\u200b", "")).strip()
        if label:
            if self.a_href:
                self.output.append(f"[{label}]({self.a_href})")
            else:
                # Keep the text of anchors without a target instead of dropping it.
                self.output.append(label)
        self.a_text = ""

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        # A valueless attribute (e.g. <div class>) is reported as None.
        classes = attrs_dict.get("class") or ""

        # Skip navigation, header, footer
        if tag in self.skip_tags:
            self.skip_depth += 1
            return
        if self.skip_depth > 0:
            return

        # Track content area
        if "vp-doc" in classes or "VPDoc" in classes:
            self.in_content = True

        if not self.in_content:
            return

        if tag in self._HEADING_TAGS:
            self.heading_level = int(tag[1])
            self.output.append("\n" + "#" * self.heading_level + " ")
        elif tag == "p":
            self.output.append("\n\n")
        elif tag == "pre":
            self.in_pre = True
        elif tag == "code":
            if self.in_pre:
                self.in_code = True
                lang_match = re.search(r"language-(\w+)", classes)
                self.code_lang = lang_match.group(1) if lang_match else ""
                self.output.append(f"\n```{self.code_lang}\n")
            else:
                self._emit("`")
        elif tag == "a":
            self.in_a = True
            self.a_href = attrs_dict.get("href") or ""
            self.a_text = ""
        elif tag in ("ul", "ol"):
            self._list_stack.append(0 if tag == "ol" else None)
            self.output.append("\n")
        elif tag == "li":
            self.in_li = True
            indent = self._LIST_INDENT * max(len(self._list_stack) - 1, 0)
            if self._list_stack and self._list_stack[-1] is not None:
                self._list_stack[-1] += 1
                marker = f"{self._list_stack[-1]}. "
            else:
                marker = "- "
            self.output.append(indent + marker)
        elif tag in ("strong", "b"):
            self._emit("**")
        elif tag in ("em", "i"):
            self._emit("*")
        elif tag == "br":
            self.output.append("\n")
        elif tag == "img":
            alt = attrs_dict.get("alt") or ""
            src = attrs_dict.get("src") or ""
            self._emit(f"![{alt}]({src})")
        elif tag == "table":
            self._table_has_separator = False
            self.output.append("\n")
        elif tag == "tr":
            self._row_cells = 0
            self._row_is_header = False
            self.output.append("| ")
        elif tag in ("th", "td"):
            self._row_cells += 1
            if tag == "th":
                self._row_is_header = True
        elif tag == "blockquote":
            self.output.append("\n> ")

    def handle_endtag(self, tag):
        if tag in self.skip_tags:
            # Ignore stray closing tags so the depth never goes negative.
            if self.skip_depth > 0:
                self.skip_depth -= 1
            return
        if self.skip_depth > 0:
            return
        if not self.in_content:
            return

        if tag in self._HEADING_TAGS:
            self.output.append("\n")
            self.heading_level = 0
        elif tag == "pre":
            self.in_pre = False
        elif tag == "code":
            if self.in_code:
                self.in_code = False
                self.output.append("\n```\n")
            else:
                self._emit("`")
        elif tag == "a":
            self._close_link()
        elif tag in ("ul", "ol"):
            if self._list_stack:
                self._list_stack.pop()
        elif tag == "li":
            self.in_li = False
            self.output.append("\n")
        elif tag in ("strong", "b"):
            self._emit("**")
        elif tag in ("em", "i"):
            self._emit("*")
        elif tag == "tr":
            self.output.append("\n")
            # Markdown tables need a separator line right after the header row.
            if self._row_is_header and self._row_cells and not self._table_has_separator:
                self.output.append("| " + " | ".join(["---"] * self._row_cells) + " |\n")
                self._table_has_separator = True
        elif tag in ("th", "td"):
            self.output.append(" | ")
        elif tag == "p":
            self.output.append("\n")

    def handle_data(self, data):
        if self.skip_depth > 0:
            return

        if self.in_a:
            self.a_text += data
            return

        if not self.in_content:
            return

        if self.in_code:
            # Code block text is kept verbatim.
            self.output.append(data)
            return

        # Collapse whitespace runs but keep word boundaries, so inline markers
        # hug their text ("**bold**") while neighbouring words stay separated.
        text = re.sub(r"\s+", " ", data)
        if self._after_whitespace():
            text = text.lstrip()
        if text:
            self.output.append(text)

    def get_markdown(self):
        """Return the accumulated Markdown with excess blank lines and trailing spaces removed."""
        text = "".join(self.output)
        # Clean up
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r"[ \t]+\n", "\n", text)
        return text.strip()
|
|
|
|
|
|
def fetch_page(url):
    """Fetch a page and return its content as text.

    Args:
        url: Address to download.

    Returns:
        The decoded response body, or ``None`` if the request failed (the
        error is printed). The body is decoded with the charset declared in
        the response headers, falling back to UTF-8; undecodable bytes are
        replaced rather than causing the whole page to be discarded.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            body = resp.read()
    except Exception as e:
        # Deliberately broad: the crawler is best-effort and a single failing
        # page must not abort the whole run.
        print(f" ERROR: {e}")
        return None

    try:
        return body.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset Python does not know.
        return body.decode("utf-8", errors="replace")
|
|
|
|
|
|
def html_to_markdown(html_content):
    """Convert an HTML document to Markdown.

    Args:
        html_content: HTML source as a string.

    Returns:
        The Markdown text produced by ``HTMLToMarkdown`` for the document.
    """
    converter = HTMLToMarkdown()
    converter.feed(html_content)
    # close() forces the parser to process any text it is still buffering;
    # without it trailing data can be silently lost.
    converter.close()
    return converter.get_markdown()
|
|
|
|
|
|
def main():
    """Crawl the documentation pages and save them as Markdown files.

    Command-line options:
        --output   Output directory (default ``docs_crawled``).
        --zh-only  Only crawl pages under ``/zh/``.
        --en-only  Only crawl pages not under ``/zh/``.
                   If both flags are given, ``--zh-only`` takes precedence.

    For every selected entry of ``PAGES`` the rendered HTML is downloaded,
    converted to Markdown and written to ``<output>/<path_with_underscores>.md``
    (a page that converts to fewer than 50 characters is still written, but
    reported as sparse). Afterwards the raw Markdown sources are requested
    from GitHub and stored with a ``_raw.md`` suffix when available.
    """
    parser = argparse.ArgumentParser(description="Crawl InternDataEngine docs")
    parser.add_argument("--output", default="docs_crawled", help="Output directory")
    parser.add_argument("--zh-only", action="store_true", help="Only crawl Chinese docs")
    parser.add_argument("--en-only", action="store_true", help="Only crawl English docs")
    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)

    pages = PAGES
    if args.zh_only:
        pages = [p for p in PAGES if p.startswith("/zh/")]
    elif args.en_only:
        pages = [p for p in PAGES if not p.startswith("/zh/")]

    print(f"Crawling {len(pages)} pages to {args.output}/\n")

    for page_path in pages:
        url = BASE_URL + page_path
        # Convert path to filename: /guides/installation.html -> guides_installation.md
        md_name = page_path.strip("/").replace("/", "_").replace(".html", ".md")
        md_path = os.path.join(args.output, md_name)

        print(f" {page_path} -> {md_name}", end=" ")

        html = fetch_page(url)
        if html is None:
            print("FAILED")
            continue

        md = html_to_markdown(html)
        if len(md) < 50:
            # VitePress is a SPA: the content may be rendered by JavaScript, in
            # which case the static HTML holds almost no text. The raw Markdown
            # fetched from GitHub below serves as the fallback.
            print(f"(sparse content: {len(md)} chars, SPA rendering)")
        else:
            print(f"OK ({len(md)} chars)")

        # Explicit UTF-8: the pages contain non-ASCII (Chinese) text and the
        # platform default encoding may not be able to represent it.
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Source: {url}\n\n")
            f.write(md)

        # Small delay between requests to be polite to the server.
        time.sleep(0.3)

    # Also try fetching raw markdown from GitHub
    print("\n\nAttempting raw markdown from GitHub...")
    gh_base = "https://raw.githubusercontent.com/InternRobotics/InternDataEngine/master/docs"
    # VitePress source files: repository path -> local file name
    raw_pages = {
        "guides/installation.md": "guides_installation_raw.md",
        "guides/quickstart.md": "guides_quickstart_raw.md",
        "concepts/workflows.md": "concepts_workflows_raw.md",
        "concepts/objects.md": "concepts_objects_raw.md",
        "concepts/cameras.md": "concepts_cameras_raw.md",
        "concepts/robots.md": "concepts_robots_raw.md",
        "concepts/controllers.md": "concepts_controllers_raw.md",
        "concepts/skills/overview.md": "concepts_skills_overview_raw.md",
        "concepts/skills/pick.md": "concepts_skills_pick_raw.md",
        "concepts/skills/place.md": "concepts_skills_place_raw.md",
        "concepts/skills/articulation.md": "concepts_skills_articulation_raw.md",
        "config/yaml.md": "config_yaml_raw.md",
        "config/dr.md": "config_dr_raw.md",
        "config/assets.md": "config_assets_raw.md",
        "custom/assets.md": "custom_assets_raw.md",
        "custom/robot.md": "custom_robot_raw.md",
        "custom/controller.md": "custom_controller_raw.md",
        "custom/skill.md": "custom_skill_raw.md",
        "custom/task.md": "custom_task_raw.md",
        "policy/training.md": "policy_training_raw.md",
        "api/controllers.md": "api_controllers_raw.md",
        "api/skills.md": "api_skills_raw.md",
    }

    for src, dst in raw_pages.items():
        url = f"{gh_base}/{src}"
        dst_path = os.path.join(args.output, dst)
        print(f" {src}", end=" ")
        content = fetch_page(url)
        # Reject failures, near-empty bodies and HTML error pages.
        if content and len(content) > 50 and not content.strip().startswith("<!DOCTYPE"):
            with open(dst_path, "w", encoding="utf-8") as f:
                f.write(content)
            print(f"OK ({len(content)} chars)")
        else:
            print("NOT FOUND or HTML")
        time.sleep(0.3)

    print(f"\nDone. Files saved to {args.output}/")


if __name__ == "__main__":
    main()
|