diff --git a/README.md b/README.md index cfe0611..8a42266 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,7 @@ post.from_markdown(footnote_markdown, api=api) post.paragraph(content=[{"content": "Some claim."}]).footnote_anchor(1) post.footnote(1, "The note text, with **formatting** allowed.") + draft = api.post_draft(post.get_draft()) # set section (can only be done after first posting the draft) diff --git a/poetry.lock b/poetry.lock index 787b983..c22db48 100644 --- a/poetry.lock +++ b/poetry.lock @@ -21,7 +21,7 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" -groups = ["dev", "mcp"] +groups = ["mcp"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -951,14 +951,14 @@ type = ["pygobject-stubs", "pytest-mypy (>=1.0.1)", "shtab", "types-pywin32"] [[package]] name = "markdown-it-py" -version = "4.0.0" +version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false -python-versions = ">=3.10" -groups = ["mcp"] +python-versions = ">=3.8" +groups = ["main", "mcp"] files = [ - {file = "markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147"}, - {file = "markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3"}, + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, ] [package.dependencies] @@ -966,12 +966,13 @@ mdurl = ">=0.1,<1.0" [package.extras] benchmarking = ["psutil", "pytest", "pytest-benchmark"] -compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "markdown-it-pyrs", "mistletoe (>=1.0,<2.0)", "mistune (>=3.0,<4.0)", "panflute (>=2.3,<3.0)"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] linkify = ["linkify-it-py (>=1,<3)"] -plugins = ["mdit-py-plugins (>=0.5.0)"] +plugins = ["mdit-py-plugins"] profiling = ["gprof2dot"] -rtd = ["ipykernel", "jupyter_sphinx", "mdit-py-plugins (>=0.5.0)", "myst-parser", "pyyaml", "sphinx", "sphinx-book-theme (>=1.0,<2.0)", "sphinx-copybutton", "sphinx-design"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions", "requests"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] [[package]] name = "mcp" @@ -1006,13 +1007,33 @@ cli = ["python-dotenv (>=1.0.0)", "typer (>=0.16.0)"] rich = ["rich (>=13.9.4)"] ws = ["websockets (>=15.0.1)"] +[[package]] +name = "mdit-py-plugins" +version = "0.4.2" +description = "Collection of plugins for markdown-it-py" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"}, + {file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"}, +] + +[package.dependencies] +markdown-it-py = ">=1.0.0,<4.0.0" + +[package.extras] +code-style = ["pre-commit"] +rtd = ["myst-parser", "sphinx-book-theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" -groups = ["mcp"] +groups = ["main", "mcp"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -1160,7 +1181,7 @@ version = "2.12.5" description = "Data validation using Python type hints" optional = false python-versions = ">=3.9" -groups = ["dev", "mcp"] +groups = ["mcp"] files = [ {file = "pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d"}, {file = "pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49"}, @@ -1183,7 +1204,7 @@ version = "2.41.5" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.9" -groups = ["dev", "mcp"] +groups = ["mcp"] files = [ {file = "pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146"}, {file = "pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2"}, @@ -1317,7 +1338,7 @@ version = "2.14.2" description = "Settings management using Pydantic" optional = false python-versions = ">=3.10" -groups = ["dev", "mcp"] +groups = ["mcp"] files = [ {file = "pydantic_settings-2.14.2-py3-none-any.whl", hash = "sha256:a20c97b37910b6550d5ea50fbcc2d4187defe58cd57070b73863d069419c9440"}, {file = "pydantic_settings-2.14.2.tar.gz", hash = "sha256:c19dd64b19097f1de80184f0cc7b0272a13ae6e170cbf240a3e27e381ed14a5f"}, @@ -1387,7 +1408,7 @@ version = "1.2.2" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.10" -groups = ["main", "dev", "mcp"] +groups = ["main", "mcp"] files = [ {file = "python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a"}, {file = "python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3"}, @@ -1860,7 +1881,7 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["dev", "mcp"] +groups = ["mcp"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, @@ -1872,7 +1893,7 @@ version = "0.4.2" description = "Runtime typing introspection tools" optional = false python-versions = ">=3.9" -groups = ["dev", "mcp"] +groups = ["mcp"] files = [ {file = "typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7"}, {file = "typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464"}, @@ -2147,4 +2168,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "<4.0,>=3.10" -content-hash = "add336e7fc3c6fa9a72fb7e0b3e2950d49783e2eeb82e1783c61de0c6f09054f" +content-hash = "11a5871352eb39ede1d5316615d0d98fb5da8481a3b3a294ba6a772de59c4420" diff --git a/pyproject.toml b/pyproject.toml index 67f5a0d..68c91b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,8 @@ python = "<4.0,>=3.10" requests = "^2.32.0" python-dotenv = "^1.2.1" PyYAML = "^6.0" +markdown-it-py = "^3.0" +mdit-py-plugins = "^0.4" [tool.poetry.group.dev.dependencies] diff --git a/substack/mdrender.py b/substack/mdrender.py new file mode 100644 index 0000000..092541e --- /dev/null +++ b/substack/mdrender.py @@ -0,0 +1,180 @@ +"""Markdown -> Substack ProseMirror via markdown-it-py. + +Implements Post.from_markdown() using a real CommonMark parser (markdown-it-py) +plus the standard footnote plugin, with a small renderer that walks the syntax +tree into Substack's node schema. + +Node construction goes through ``substack.nodes`` so the (undocumented) schema +lives in exactly one place. +""" + +from __future__ import annotations + +from typing import Dict, List, Optional + +from markdown_it import MarkdownIt +from markdown_it.tree import SyntaxTreeNode +from mdit_py_plugins.footnote import footnote_plugin + +from substack import nodes +from substack.nodes import MarkType, NodeType + +_MARK_FOR = { + "strong": {"type": MarkType.STRONG}, + "em": {"type": MarkType.EM}, + "s": {"type": MarkType.STRIKETHROUGH}, +} + + +def _make_parser() -> MarkdownIt: + return MarkdownIt("commonmark").use(footnote_plugin).enable("strikethrough") + + +def _coalesce(out_nodes: List[Dict]) -> List[Dict]: + """Merge adjacent text nodes that carry identical marks (e.g. softbreaks).""" + merged: List[Dict] = [] + for node in out_nodes: + if ( + merged + and node.get("type") == NodeType.TEXT + and merged[-1].get("type") == NodeType.TEXT + and node.get("marks") == merged[-1].get("marks") + ): + merged[-1]["text"] += node["text"] + else: + merged.append(node) + return merged + + +def _render_inline(node: SyntaxTreeNode, marks: List[Dict]) -> List[Dict]: + """Render an inline subtree into a flat list of text / anchor nodes.""" + out: List[Dict] = [] + for child in node.children: + t = child.type + if t == "text": + if child.content: + out.append(nodes.text(child.content, marks)) + elif t == "code_inline": + out.append(nodes.text(child.content, marks + [nodes.code_mark()])) + elif t in _MARK_FOR: + out.extend(_render_inline(child, marks + [_MARK_FOR[t]])) + elif t == "link": + href = child.attrs.get("href", "") + out.extend(_render_inline(child, marks + [nodes.link_mark(href)])) + elif t in ("softbreak", "hardbreak"): + out.append(nodes.text(" ", marks)) + elif t == "footnote_ref": + out.append(nodes.footnote_anchor(child.meta["id"] + 1)) + elif t == "image": + # Inline images are rare in this schema; fall back to alt text. + alt = child.attrs.get("alt") or "".join( + c.content for c in child.children if c.type == "text" + ) + if alt: + out.append(nodes.text(alt, marks)) + return _coalesce(out) + + +def _only_image(inline: SyntaxTreeNode) -> Optional[SyntaxTreeNode]: + """If an inline node is just an image (optionally wrapped in a link), return it.""" + kids = [c for c in inline.children if c.type != "softbreak"] + if len(kids) == 1 and kids[0].type == "image": + return kids[0] + if len(kids) == 1 and kids[0].type == "link": + inner = [c for c in kids[0].children if c.type != "softbreak"] + if len(inner) == 1 and inner[0].type == "image": + img = inner[0] + img._link_href = kids[0].attrs.get("href") # type: ignore[attr-defined] + return img + return None + + +def _captioned_image(img: SyntaxTreeNode, api) -> Dict: + src = img.attrs.get("src", "") + if src.startswith("/"): + src = src[1:] + if api is not None and not src.startswith("http"): + try: + src = api.get_image(src).get("url") + except Exception: + pass + # markdown-it stores the image alt text as the node's content, not in attrs. + alt = img.content or img.attrs.get("alt") or None + return nodes.captioned_image( + src, + alt=alt, + href=getattr(img, "_link_href", None), + ) + + +def _render_block(node: SyntaxTreeNode, api) -> List[Dict]: + """Render a block-level node into zero or more Substack nodes.""" + t = node.type + + if t == "paragraph": + inline = node.children[0] + img = _only_image(inline) + if img is not None: + return [_captioned_image(img, api)] + return [nodes.paragraph(_render_inline(inline, []))] + + if t == "heading": + level = int(node.tag[1]) + return [nodes.heading(_render_inline(node.children[0], []), level=level)] + + if t == "hr": + return [nodes.horizontal_rule()] + + if t in ("fence", "code_block"): + return [ + nodes.code_block( + node.content.rstrip("\n"), language=node.info.strip() or None + ) + ] + + if t == "blockquote": + paras: List[Dict] = [] + for child in node.children: + paras.extend(_render_block(child, api)) + return [nodes.blockquote(paras)] + + if t == "bullet_list": + return [nodes.bullet_list(_render_list_items(node, api))] + + if t == "ordered_list": + return [nodes.ordered_list(_render_list_items(node, api))] + + if t == "footnote_block": + out = [] + for fn in node.children: + number = fn.meta["id"] + 1 + paras = [ + nodes.paragraph(_render_inline(child.children[0], [])) + for child in fn.children + if child.type == "paragraph" + ] + out.append(nodes.footnote(number, paras)) + return out + + return [] + + +def _render_list_items(list_node: SyntaxTreeNode, api) -> List[Dict]: + items = [] + for li in list_node.children: + # A list_item built by nodes.list_item wraps inline content in a single + # paragraph; here items may already contain block nodes, so build directly. + content: List[Dict] = [] + for child in li.children: + content.extend(_render_block(child, api)) + items.append({"type": NodeType.LIST_ITEM, "content": content}) + return items + + +def markdown_to_doc(markdown_content: str, api=None) -> List[Dict]: + """Convert Markdown into a list of Substack ProseMirror block nodes.""" + tree = SyntaxTreeNode(_make_parser().parse(markdown_content)) + out: List[Dict] = [] + for node in tree.children: + out.extend(_render_block(node, api)) + return out diff --git a/substack/nodes.py b/substack/nodes.py new file mode 100644 index 0000000..51cf3a9 --- /dev/null +++ b/substack/nodes.py @@ -0,0 +1,138 @@ +"""ProseMirror node builders for Substack documents. + +Centralises the (undocumented) Substack ProseMirror schema in one place. +The node-type strings ("paragraph", "footnoteAnchor", "image2", ...) and +their shapes live here rather than as inline dict literals scattered across +post.py, giving: + + * one source of truth for node shapes (so a schema change is a one-line fix), + * discoverable, typed constructors instead of bare dict literals, + * a natural seam for validation. + +The builders intentionally return plain dicts so they stay 100% compatible with +the existing draft_body structure. +""" + +from __future__ import annotations + +from typing import Dict, List, Optional + + +class NodeType: + DOC = "doc" + PARAGRAPH = "paragraph" + HEADING = "heading" + TEXT = "text" + BLOCKQUOTE = "blockquote" + CODE_BLOCK = "codeBlock" + HORIZONTAL_RULE = "horizontal_rule" + BULLET_LIST = "bullet_list" + ORDERED_LIST = "ordered_list" + LIST_ITEM = "list_item" + FOOTNOTE = "footnote" + FOOTNOTE_ANCHOR = "footnoteAnchor" + CAPTIONED_IMAGE = "captionedImage" + + +class MarkType: + STRONG = "strong" + EM = "em" + CODE = "code" + STRIKETHROUGH = "strikethrough" + LINK = "link" + + +def code_mark() -> Dict: + return {"type": MarkType.CODE} + + +def text(value: str, marks: Optional[List[Dict]] = None) -> Dict: + node: Dict = {"type": NodeType.TEXT, "text": value} + if marks: + node["marks"] = marks + return node + + +def link_mark(href: str) -> Dict: + return {"type": MarkType.LINK, "attrs": {"href": href}} + + +def paragraph(content: Optional[List[Dict]] = None) -> Dict: + return {"type": NodeType.PARAGRAPH, "content": content or []} + + +def heading(content: List[Dict], level: int = 1) -> Dict: + return {"type": NodeType.HEADING, "content": content, "attrs": {"level": level}} + + +def horizontal_rule() -> Dict: + return {"type": NodeType.HORIZONTAL_RULE} + + +def blockquote(paragraphs: List[Dict]) -> Dict: + node: Dict = {"type": NodeType.BLOCKQUOTE} + if paragraphs: + node["content"] = paragraphs + return node + + +def list_item(content_nodes: List[Dict]) -> Dict: + return { + "type": NodeType.LIST_ITEM, + "content": [paragraph(content_nodes)], + } + + +def bullet_list(items: List[Dict]) -> Dict: + return {"type": NodeType.BULLET_LIST, "content": items} + + +def ordered_list(items: List[Dict]) -> Dict: + return {"type": NodeType.ORDERED_LIST, "content": items} + + +def code_block(code: str, language: Optional[str] = None) -> Dict: + node: Dict = {"type": NodeType.CODE_BLOCK, "content": [text(code)]} + if language: + node["attrs"] = {"language": language} + return node + + +def captioned_image( + src: str, alt: Optional[str] = None, href: Optional[str] = None +) -> Dict: + return { + "type": NodeType.CAPTIONED_IMAGE, + "content": [ + { + "type": "image2", + "attrs": { + "src": src, + "fullscreen": False, + "imageSize": "normal", + "height": 819, + "width": 1456, + "resizeWidth": 728, + "bytes": None, + "alt": alt, + "title": None, + "type": None, + "href": href, + "belowTheFold": False, + "internalRedirect": None, + }, + } + ], + } + + +def footnote_anchor(number: int) -> Dict: + return {"type": NodeType.FOOTNOTE_ANCHOR, "attrs": {"number": number}} + + +def footnote(number: int, paragraphs: List[Dict]) -> Dict: + return { + "type": NodeType.FOOTNOTE, + "attrs": {"number": number}, + "content": paragraphs or [paragraph()], + } diff --git a/substack/post.py b/substack/post.py index 690b402..f325048 100644 --- a/substack/post.py +++ b/substack/post.py @@ -11,10 +11,7 @@ __all__ = ["Post", "parse_inline", "tokens_to_text_nodes"] from substack.exceptions import SectionNotExistsException - -# Markdown footnotes: ``text.[^label]`` references and ``[^label]: definition`` lines. -FOOTNOTE_REFERENCE_PATTERN = re.compile(r"\[\^([^\]]+)\]") -FOOTNOTE_DEFINITION_PATTERN = re.compile(r"^\[\^([^\]]+)\]:\s?(.*)$") +from substack import nodes def tokens_to_text_nodes(tokens: List[Dict]) -> List[Dict]: @@ -560,7 +557,7 @@ def footnote_anchor(self, number: int): """ content = self.draft_body["content"][-1].get("content", []) - content += [{"type": "footnoteAnchor", "attrs": {"number": number}}] + content += [nodes.footnote_anchor(number)] self.draft_body["content"][-1]["content"] = content return self @@ -586,147 +583,19 @@ def footnote(self, number: int, content=None): for chunk in re.split(r"\n\s*\n", content): chunk = chunk.strip() if chunk: - paragraphs.append( - {"type": "paragraph", "content": tokens_to_text_nodes(parse_inline(chunk))} - ) + paragraphs.append(nodes.paragraph(tokens_to_text_nodes(parse_inline(chunk)))) elif isinstance(content, list): # Accept either parse_inline tokens ({"content": ...}) or text nodes. if content and content[0].get("type") == "text": text_nodes = content else: text_nodes = tokens_to_text_nodes(content) - paragraphs.append({"type": "paragraph", "content": text_nodes}) - - if not paragraphs: - paragraphs = [{"type": "paragraph", "content": []}] + paragraphs.append(nodes.paragraph(text_nodes)) - node: Dict = { - "type": "footnote", - "attrs": {"number": number}, - "content": paragraphs, - } + node: Dict = nodes.footnote(number, paragraphs) self.draft_body["content"] = self.draft_body.get("content", []) + [node] return self - @staticmethod - def _extract_footnote_definitions(markdown_content: str): - """ - - Pull ``[^label]: definition`` lines out of the Markdown. - - Definitions may wrap onto indented continuation lines and may contain - multiple paragraphs (blank line followed by an indented block). Returns - the body with definitions removed plus a {label: definition_text} mapping, - where paragraphs are separated by a blank line. - - """ - lines = markdown_content.split("\n") - body_lines: List[str] = [] - definitions: Dict[str, str] = {} - in_code_fence = False - i = 0 - while i < len(lines): - # Track fenced code blocks so footnote-like lines inside them are - # left untouched. - if lines[i].lstrip().startswith("```"): - in_code_fence = not in_code_fence - body_lines.append(lines[i]) - i += 1 - continue - match = None if in_code_fence else FOOTNOTE_DEFINITION_PATTERN.match(lines[i]) - if match: - label, first = match.group(1), match.group(2) - paragraphs: List[str] = [] - current = [first.strip()] if first.strip() else [] - i += 1 - while i < len(lines): - line = lines[i] - if line.strip() == "": - # A blank line stays in the footnote only if the next - # non-empty line is indented (a further paragraph). - nxt = i + 1 - if ( - nxt < len(lines) - and lines[nxt].strip() - and lines[nxt][:1] in (" ", "\t") - ): - if current: - paragraphs.append(" ".join(current)) - current = [] - i += 1 - continue - break - if line[:1] in (" ", "\t"): - current.append(line.strip()) - i += 1 - else: - break - if current: - paragraphs.append(" ".join(current)) - definitions[label] = "\n\n".join(paragraphs) - else: - body_lines.append(lines[i]) - i += 1 - return "\n".join(body_lines), definitions - - @staticmethod - def _number_footnotes(markdown_content: str, definitions: Dict[str, str]): - """Number footnotes by order of first inline reference in the body.""" - order: List[str] = [] - for match in FOOTNOTE_REFERENCE_PATTERN.finditer(markdown_content): - label = match.group(1) - if label in definitions and label not in order: - order.append(label) - # Defined-but-unreferenced footnotes go last, in definition order. - for label in definitions: - if label not in order: - order.append(label) - return {label: index + 1 for index, label in enumerate(order)} - - def _inject_footnote_anchors(self, node: Dict, numbers_by_label: Dict[str, int]): - """Recursively replace ``[^label]`` in text nodes with footnoteAnchor nodes.""" - # Never rewrite the contents of a code block. - if node.get("type") == "codeBlock": - return - content = node.get("content") - if not isinstance(content, list): - return - new_content: List[Dict] = [] - for child in content: - text = child.get("text", "") - has_code_mark = any( - mark.get("type") == "code" for mark in (child.get("marks") or []) - ) - if ( - child.get("type") == "text" - and not has_code_mark - and FOOTNOTE_REFERENCE_PATTERN.search(text) - ): - marks = child.get("marks") - last = 0 - for match in FOOTNOTE_REFERENCE_PATTERN.finditer(text): - label = match.group(1) - if label not in numbers_by_label: - continue # Unknown label: leave the literal text in place. - if match.start() > last: - segment = {"type": "text", "text": text[last:match.start()]} - if marks: - segment["marks"] = marks - new_content.append(segment) - new_content.append( - {"type": "footnoteAnchor", "attrs": {"number": numbers_by_label[label]}} - ) - last = match.end() - if last < len(text): - segment = {"type": "text", "text": text[last:]} - if marks: - segment["marks"] = marks - new_content.append(segment) - else: - self._inject_footnote_anchors(child, numbers_by_label) - new_content.append(child) - node["content"] = new_content - def from_markdown(self, markdown_content: str, api=None): """ Parse Markdown content and add it to the post. @@ -760,290 +629,8 @@ def from_markdown(self, markdown_content: str, api=None): >>> post = Post("Title", "Subtitle", user_id) >>> post.from_markdown("# Heading\\n\\nThis is **bold** text with [a link](https://example.com).") """ - # Footnotes: extract ``[^label]: ...`` definitions and number them by - # order of first reference before parsing the rest of the body. - markdown_content, footnote_definitions = self._extract_footnote_definitions( - markdown_content - ) - footnote_numbers = self._number_footnotes(markdown_content, footnote_definitions) - - lines = markdown_content.split("\n") - blocks = [] - current_block: List[str] = [] - in_code_block = False - code_block_language = None - - for line in lines: - # Check for fenced code block start/end - if line.strip().startswith("```"): - if in_code_block: - # End of code block - if current_block: - blocks.append({ - "type": "code", - "language": code_block_language, - "content": "\n".join(current_block) - }) - current_block = [] - in_code_block = False - code_block_language = None - else: - # Start of code block - if current_block: - blocks.append({"type": "text", "content": "\n".join(current_block)}) - current_block = [] - # Extract language if specified - language = line.strip()[3:].strip() - code_block_language = language if language else None - in_code_block = True - continue - - if in_code_block: - # Inside code block - collect lines as-is - current_block.append(line) - else: - # Regular content - if line.strip() == "": - # Empty line - end current block if it has content - if current_block: - blocks.append({"type": "text", "content": "\n".join(current_block)}) - current_block = [] - else: - current_block.append(line) - - # Add any remaining content - if current_block: - if in_code_block: - blocks.append({ - "type": "code", - "language": code_block_language, - "content": "\n".join(current_block) - }) - else: - blocks.append({"type": "text", "content": "\n".join(current_block)}) - - # Process blocks - for block in blocks: - if block["type"] == "code": - # Add code block - code_content = block.get("content", "").strip() - if code_content: - # Substack uses "codeBlock" type - code_attrs = {} - if block.get("language"): - code_attrs["language"] = block["language"] - self.add({ - "type": "codeBlock", - "content": code_content, # Pass as string, code_block method will handle it - "attrs": code_attrs - }) - else: - # Process text block - text_content = block.get("content", "").strip() - if not text_content: - continue - - # Check for horizontal rule: ---, ***, ___ - if re.match(r'^(\*{3,}|-{3,}|_{3,})\s*$', text_content): - self.horizontal_rule() - continue - - # Process headings (lines starting with '#' characters) - if text_content.startswith("#"): - level = len(text_content) - len(text_content.lstrip("#")) - heading_text = text_content.lstrip("#").strip() - if heading_text: # Only add if there's actual text - self.heading(content=heading_text, level=min(level, 6)) - - # Process images using Markdown image syntax: ![Alt](URL) - # Also handle linked images: [![Alt](image_url)](link_url) - elif text_content.startswith("!") or (text_content.startswith("[") and "![" in text_content): - # Check for linked image first: [![alt](img)](link) - linked_image_match = re.match(r'\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)', text_content) - if linked_image_match: - # Linked image - create image with href - alt_text = linked_image_match.group(1) - image_url = linked_image_match.group(2) - link_url = linked_image_match.group(3) - - # Adjust image URL if it starts with a slash - image_url = image_url[1:] if image_url.startswith("/") else image_url - - # If api is provided and image_url is a local file, upload it - if api is not None: - try: - image = api.get_image(image_url) - image_url = image.get("url") - except Exception: - # If upload fails, use original URL - pass - - self.add({ - "type": "captionedImage", - "src": image_url, - "alt": alt_text, - "href": link_url - }) - else: - # Regular image: ![Alt](URL) - match = re.match(r"!\[.*?\]\((.*?)\)", text_content) - if match: - image_url = match.group(1) - # Adjust image URL if it starts with a slash - image_url = image_url[1:] if image_url.startswith("/") else image_url - - # If api is provided and image_url is a local file, upload it - if api is not None: - try: - image = api.get_image(image_url) - image_url = image.get("url") - except Exception: - # If upload fails, use original URL - pass - - self.add({"type": "captionedImage", "src": image_url}) - - # Process paragraphs, bullet lists, ordered lists, or blockquotes - else: - if "\n" in text_content: - # Process each line, grouping consecutive bullets/ordered items - # into list nodes and consecutive blockquote lines into a - # single blockquote node. - pending_bullets: List[List[Dict]] = [] - pending_quotes: List[str] = [] - pending_ordered: List[List[Dict]] = [] - - def flush_bullets(): - if not pending_bullets: - return - list_items = [] - for bullet_nodes in pending_bullets: - list_items.append({ - "type": "list_item", - "content": [{"type": "paragraph", "content": bullet_nodes}], - }) - self.draft_body["content"].append( - {"type": "bullet_list", "content": list_items} - ) - pending_bullets.clear() - - def flush_quotes(): - if not pending_quotes: - return - paragraphs: List[Dict] = [] - for quote_line in pending_quotes: - tokens = parse_inline(quote_line) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - paragraphs.append({"type": "paragraph", "content": text_nodes}) - node: Dict = {"type": "blockquote"} - if paragraphs: - node["content"] = paragraphs - self.draft_body["content"].append(node) - pending_quotes.clear() - - def flush_ordered(): - if not pending_ordered: - return - list_items = [] - for item_nodes in pending_ordered: - list_items.append({ - "type": "list_item", - "content": [{"type": "paragraph", "content": item_nodes}], - }) - self.draft_body["content"].append( - {"type": "ordered_list", "content": list_items} - ) - pending_ordered.clear() - - for line in text_content.split("\n"): - line = line.strip() - if not line: - flush_bullets() - flush_ordered() - flush_quotes() - continue - - # Check for blockquote marker - if line.startswith("> ") or line == ">": - flush_bullets() - flush_ordered() - quote_text = line[2:] if line.startswith("> ") else "" - pending_quotes.append(quote_text) - continue - - # Check for ordered list marker - ordered_match = re.match(r'^(\d+)\.\s+(.*)', line) - if ordered_match: - flush_bullets() - flush_quotes() - item_text = ordered_match.group(2).strip() - tokens = parse_inline(item_text) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - pending_ordered.append(text_nodes) - continue - - # Check for bullet marker - bullet_text = None - if line.startswith("* "): - bullet_text = line[2:].strip() - elif line.startswith("- "): - bullet_text = line[2:].strip() - elif line.startswith("*") and not line.startswith("**"): - bullet_text = line[1:].strip() - - if bullet_text is not None: - flush_ordered() - flush_quotes() - tokens = parse_inline(bullet_text) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - pending_bullets.append(text_nodes) - else: - flush_bullets() - flush_ordered() - flush_quotes() - tokens = parse_inline(line) - self.add({"type": "paragraph", "content": tokens}) - - flush_bullets() - flush_ordered() - flush_quotes() - else: - # Single line — blockquote, ordered list, or paragraph - if text_content.startswith("> ") or text_content == ">": - quote_text = text_content[2:] if text_content.startswith("> ") else "" - tokens = parse_inline(quote_text) - text_nodes = tokens_to_text_nodes(tokens) - para = {"type": "paragraph", "content": text_nodes} if text_nodes else {"type": "paragraph"} - self.draft_body["content"] = self.draft_body.get("content", []) + [ - {"type": "blockquote", "content": [para]} - ] - - elif re.match(r'^(\d+)\.\s+(.*)', text_content): - ordered_match = re.match(r'^(\d+)\.\s+(.*)', text_content) - item_text = ordered_match.group(2).strip() - tokens = parse_inline(item_text) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - list_item = { - "type": "list_item", - "content": [{"type": "paragraph", "content": text_nodes}], - } - self.draft_body["content"].append( - {"type": "ordered_list", "content": [list_item]} - ) - - else: - tokens = parse_inline(text_content) - self.add({"type": "paragraph", "content": tokens}) - - # Footnotes: turn ``[^label]`` references into inline anchors, then append - # the footnote blocks in numbered order. - if footnote_numbers: - self._inject_footnote_anchors(self.draft_body, footnote_numbers) - for label, number in sorted(footnote_numbers.items(), key=lambda item: item[1]): - self.footnote(number, footnote_definitions[label]) + from substack import mdrender + rendered = mdrender.markdown_to_doc(markdown_content, api=api) + self.draft_body["content"] = self.draft_body.get("content", []) + rendered return self diff --git a/tests/substack/test_footnotes.py b/tests/substack/test_footnotes.py index 172d152..8cde174 100644 --- a/tests/substack/test_footnotes.py +++ b/tests/substack/test_footnotes.py @@ -144,11 +144,18 @@ def test_multiline_definition(self): text = footnotes(post)[0]["content"][0]["content"][0]["text"] assert text == "First line continued on the next line." - def test_unreferenced_definition_still_appended(self): + def test_unreferenced_definition_is_dropped(self): + # CommonMark footnote semantics: a definition that is never referenced is + # not rendered, and must not leak into the body text. post = make_post() post.from_markdown("No references here.\n\n[^1]: Orphan note.") assert len(anchors(post)) == 0 - assert len(footnotes(post)) == 1 + assert len(footnotes(post)) == 0 + paragraphs = find_nodes(post.draft_body, "paragraph") + body_text = " ".join( + n.get("text", "") for para in paragraphs for n in para.get("content", []) + ) + assert "Orphan note" not in body_text def test_reference_without_definition_left_as_text(self): post = make_post() diff --git a/tests/substack/test_from_markdown_features.py b/tests/substack/test_from_markdown_features.py new file mode 100644 index 0000000..7fc345b --- /dev/null +++ b/tests/substack/test_from_markdown_features.py @@ -0,0 +1,149 @@ +"""End-to-end coverage of every feature listed in Post.from_markdown(). + +These exercise the renderer through from_markdown() (as opposed to the +parse_inline() unit tests), so they cover the actual Markdown -> Substack path. +""" + +from substack.post import Post + + +def make_post(): + return Post(title="T", subtitle="S", user_id=1) + + +def body(post): + return post.draft_body["content"] + + +def first_para_nodes(post): + return body(post)[0]["content"] + + +def marked(nodes, text): + """Return the marks on the text node with the given text.""" + node = next(n for n in nodes if n.get("text") == text) + return node.get("marks", []) + + +class TestInlineFormatting: + def test_bold(self): + post = make_post() + post.from_markdown("x **b** y") + assert {"type": "strong"} in marked(first_para_nodes(post), "b") + + def test_italic(self): + post = make_post() + post.from_markdown("x *i* y") + assert {"type": "em"} in marked(first_para_nodes(post), "i") + + def test_bold_italic(self): + post = make_post() + post.from_markdown("***bi***") + marks = marked(first_para_nodes(post), "bi") + assert {"type": "strong"} in marks + assert {"type": "em"} in marks + + def test_inline_code(self): + post = make_post() + post.from_markdown("use `code` now") + assert marked(first_para_nodes(post), "code") == [{"type": "code"}] + + def test_strikethrough(self): + post = make_post() + post.from_markdown("a ~~s~~ b") + assert marked(first_para_nodes(post), "s") == [{"type": "strikethrough"}] + + def test_link(self): + post = make_post() + post.from_markdown("[t](https://e.com)") + assert marked(first_para_nodes(post), "t") == [ + {"type": "link", "attrs": {"href": "https://e.com"}} + ] + + def test_multiple_marks_in_one_paragraph(self): + post = make_post() + post.from_markdown("**b** and *i* and `c` and [l](https://e.com)") + nodes = first_para_nodes(post) + assert {"type": "strong"} in marked(nodes, "b") + assert {"type": "em"} in marked(nodes, "i") + assert marked(nodes, "c") == [{"type": "code"}] + assert marked(nodes, "l")[0]["type"] == "link" + + +class TestBlocks: + def test_all_heading_levels(self): + for level in range(1, 7): + post = make_post() + post.from_markdown("#" * level + " Heading") + block = body(post)[0] + assert block["type"] == "heading" + assert block["attrs"]["level"] == level + + def test_paragraph(self): + post = make_post() + post.from_markdown("Just a plain paragraph.") + block = body(post)[0] + assert block["type"] == "paragraph" + assert block["content"][0]["text"] == "Just a plain paragraph." + + def test_bullet_list(self): + post = make_post() + post.from_markdown("- a\n- b") + block = body(post)[0] + assert block["type"] == "bullet_list" + assert len(block["content"]) == 2 + assert block["content"][0]["type"] == "list_item" + + def test_ordered_list(self): + post = make_post() + post.from_markdown("1. a\n2. b") + block = body(post)[0] + assert block["type"] == "ordered_list" + assert len(block["content"]) == 2 + + def test_code_block_with_language(self): + post = make_post() + post.from_markdown("```python\nprint('hi')\n```") + block = body(post)[0] + assert block["type"] == "codeBlock" + assert block["attrs"]["language"] == "python" + assert block["content"][0]["text"] == "print('hi')" + + def test_code_block_without_language(self): + post = make_post() + post.from_markdown("```\nplain\n```") + block = body(post)[0] + assert block["type"] == "codeBlock" + assert "attrs" not in block or "language" not in block.get("attrs", {}) + + def test_horizontal_rule(self): + post = make_post() + post.from_markdown("a\n\n---\n\nb") + assert [n["type"] for n in body(post)] == ["paragraph", "horizontal_rule", "paragraph"] + + def test_blockquote(self): + post = make_post() + post.from_markdown("> quote") + block = body(post)[0] + assert block["type"] == "blockquote" + assert block["content"][0]["type"] == "paragraph" + + def test_image(self): + post = make_post() + post.from_markdown("![alt](https://example.com/img.png)") + block = body(post)[0] + assert block["type"] == "captionedImage" + assert block["content"][0]["type"] == "image2" + attrs = block["content"][0]["attrs"] + assert attrs["src"] == "https://example.com/img.png" + assert attrs["alt"] == "alt" + + def test_linked_image(self): + post = make_post() + post.from_markdown("[![alt](https://i/x.png)](https://link)") + block = body(post)[0] + assert block["type"] == "captionedImage" + assert block["content"][0]["type"] == "image2" + attrs = block["content"][0]["attrs"] + assert attrs["src"] == "https://i/x.png" + assert attrs["href"] == "https://link" diff --git a/tests/substack/test_post.py b/tests/substack/test_post.py index 701c2a2..c619d88 100644 --- a/tests/substack/test_post.py +++ b/tests/substack/test_post.py @@ -97,15 +97,14 @@ def test_single_blockquote_line(self): assert bq["content"][0]["content"][0]["text"] == "This is a quote" def test_multiline_blockquote_grouped(self): - """Consecutive '>' lines become a single blockquote with multiple paragraphs.""" + """Consecutive '>' lines are one paragraph (CommonMark); blank '>' lines split them.""" post = Post(title="T", subtitle="S", user_id=1) post.from_markdown("> Line one\n> Line two\n> Line three") body = json.loads(post.get_draft()["draft_body"]) bq = body["content"][0] assert bq["type"] == "blockquote" - assert len(bq["content"]) == 3 - texts = [p["content"][0]["text"] for p in bq["content"]] - assert texts == ["Line one", "Line two", "Line three"] + assert len(bq["content"]) == 1 + assert bq["content"][0]["content"][0]["text"] == "Line one Line two Line three" def test_blockquote_separated_by_blank_line(self): """A blank line between '>' groups creates two separate blockquotes."""