Fix generated AWS readme descriptions (#777) (8b5eb685) · Commits · Public Repositories / Smithy Rust

aws/sdk-codegen/src/main/kotlin/software/amazon/smithy/rustsdk/AwsReadmeDecorator.kt

+179 −8

Original line number	Diff line number	Diff line
		@@ -6,6 +6,8 @@
		package software.amazon.smithy.rustsdk

		import org.jsoup.Jsoup
		import org.jsoup.nodes.Element
		import org.jsoup.nodes.TextNode
		import software.amazon.smithy.model.traits.DocumentationTrait
		import software.amazon.smithy.rust.codegen.rustlang.raw
		import software.amazon.smithy.rust.codegen.smithy.CodegenContext
		@@ -13,6 +15,13 @@ import software.amazon.smithy.rust.codegen.smithy.RustCrate
		import software.amazon.smithy.rust.codegen.smithy.customize.RustCodegenDecorator
		import software.amazon.smithy.rust.codegen.smithy.generators.ManifestCustomizations
		import software.amazon.smithy.rust.codegen.util.getTrait
		import java.lang.StringBuilder
		import java.util.logging.Logger

		// Use a sigil that should always be unique in the text to fix line breaks and spaces
		// since Jsoup doesn't preserve whitespace at all.
		// The normalization passes write these placeholders into the document wherever a line
		// break or an indentation space must survive text extraction; they are swapped back
		// for "\n" and " " respectively once the text has been pulled out of the document.
		private const val LINE_BREAK_SIGIL = "[[smithy-rs-br]]"
		private const val SPACE_SIGIL = "[[smithy-rs-nbsp]]"

		/**
		* Generates a README.md for each service crate for display on crates.io.
		@@ -21,16 +30,17 @@ class AwsReadmeDecorator : RustCodegenDecorator {
		// Name reported for this decorator.
		// NOTE(review): presumably used by the codegen framework to identify decorators — confirm against RustCodegenDecorator.
		override val name: String = "AwsReadmeDecorator"
		// NOTE(review): presumably the position of this decorator relative to others — confirm against RustCodegenDecorator.
		override val order: Byte = 0

		// Logger named after this class; used to warn about HTML tags left over after normalization.
		private val logger: Logger = Logger.getLogger(javaClass.name)

		/**
		 * Adds a `readme` entry pointing at `README.md` under the `package` key of the
		 * crate manifest customizations.
		 */
		override fun crateManifestCustomizations(codegenContext: CodegenContext): ManifestCustomizations {
		    val packageSection = mapOf("readme" to "README.md")
		    return mapOf("package" to packageSection)
		}

		override fun extras(codegenContext: CodegenContext, rustCrate: RustCrate) {
		rustCrate.withFile("README.md") { writer ->
		// Strip HTML from the doc trait value. In the future when it's written, we can use our Rustdoc
		// documentation normalization code to convert this to Markdown.
		val description = Jsoup.parse(
		val description = normalizeDescription(
		codegenContext.moduleName,
		codegenContext.settings.getService(codegenContext.model).getTrait<DocumentationTrait>()?.value ?: ""
		).text()
		)
		val moduleName = codegenContext.settings.moduleName

		writer.raw(
		@@ -39,9 +49,9 @@ class AwsReadmeDecorator : RustCodegenDecorator {

		**Please Note: The SDK is currently released as an alpha and is intended strictly for
		feedback purposes only. Do not use this SDK for production workloads.**

		$description

		""".trimIndent() +
		"\n\n$description\n\n" +
		"""
		## Getting Started

		> Examples are availble for many services and operations, check out the
		@@ -78,4 +88,165 @@ class AwsReadmeDecorator : RustCodegenDecorator {
		)
		}
		}

		/**
		 * Strips HTML from the description and makes it human-readable Markdown.
		 *
		 * @param moduleName name of the module being generated; only used to tag the
		 *   warning logged for unrecognized HTML tags
		 * @param input the raw (HTML) documentation text
		 * @return the normalized Markdown text
		 */
		internal fun normalizeDescription(moduleName: String, input: String): String {
		    val body = Jsoup.parse(input).body()

		    // Every pass rewrites the tree in place and later passes build on the output
		    // of earlier ones, so the order of these calls is important.
		    body.stripUndesiredNodes() // Remove `<fullname>`, blank whitespace nodes, etc
		    body.normalizeInlineStyles() // Convert bold/italics tags to Markdown equivalents
		    body.normalizeAnchors() // Convert anchor tags into Markdown links
		    body.normalizeBreaks() // Convert `<br>` tags into newlines
		    body.normalizeLists() // Convert HTML lists into Markdown lists
		    body.normalizeDescriptionLists() // Converts HTML <dl> description lists into Markdown
		    body.normalizeParagraphs() // Replace paragraph tags into blocks of text separated by newlines
		    body.warnOnUnrecognizedElements(moduleName) // Log a warning if we missed something

		    // `normalizeLineWhitespace` turns the sigils back into real whitespace as its
		    // first step, so no separate replacement is needed here.
		    return body.text().normalizeLineWhitespace()
		}

		/**
		 * Removes or unwraps elements that should not show up in the generated text and
		 * drops blank text nodes that are direct children of this element.
		 *
		 * @throws IllegalStateException if a `<note>` element has no child elements
		 */
		private fun Element.stripUndesiredNodes() {
		    // Remove the `<fullname>` tag together with its contents
		    getElementsByTag("fullname").forEach { it.remove() }
		    // Unwrap `<important>` tags: the contents are kept inside a neutral `<span>`
		    getElementsByTag("important").forEach { it.changeInto("span") }
		    // Remove the `<note>` tag together with its contents
		    getElementsByTag("note").forEach { note ->
		        // The guard fires when the note has NO child elements; the previous message
		        // ("unexpectedly had children") described the opposite condition.
		        // NOTE(review): control flow is unchanged — confirm that rejecting child-less
		        // notes (rather than notes with children) is the intended check.
		        check(note.children().isNotEmpty()) { "<note> tag unexpectedly had no child elements" }
		        note.remove()
		    }

		    // Eliminate empty whitespace
		    textNodes()
		        .filter { it.isBlank }
		        .forEach { it.remove() }
		}

		/**
		 * Replaces this element in its parent with a new element named [tagName] that
		 * holds copies of this element's child nodes.
		 */
		private fun Element.changeInto(tagName: String) {
		    val replacement = Element(tagName)
		    replacement.appendChildren(childNodesCopy())
		    replaceWith(replacement)
		}

		/**
		 * Rewrites `<b>` elements as `__…__` and `<i>` elements as `_…_`.
		 */
		private fun Element.normalizeInlineStyles() {
		    // All bold tags are rewritten before the italic tags are looked up
		    for (bold in getElementsByTag("b")) {
		        normalizeInlineStyleTag("__", bold)
		    }
		    for (italic in getElementsByTag("i")) {
		        normalizeInlineStyleTag("_", italic)
		    }
		}

		/**
		 * Replaces [tag] with a `<span>` that contains copies of the tag's child nodes
		 * wrapped in [surround] on both sides.
		 */
		private fun normalizeInlineStyleTag(surround: String, tag: Element) {
		    val wrapper = Element("span")
		    wrapper.append(surround)
		    wrapper.appendChildren(tag.childNodesCopy())
		    wrapper.append(surround)
		    tag.replaceWith(wrapper)
		}

		/**
		 * Replaces each `<a>` element with a text node: a Markdown link `[text](href)` when
		 * the `href` attribute is not blank, otherwise just the anchor's text.
		 */
		private fun Element.normalizeAnchors() {
		    getElementsByTag("a").forEach { anchor ->
		        val label = anchor.text()
		        val target = anchor.attr("href")
		        val markdown = if (target.isBlank()) label else "[$label]($target)"
		        anchor.replaceWith(TextNode(markdown))
		    }
		}

		/**
		 * Replaces every `<br>` element with a text node holding the line break sigil.
		 */
		private fun Element.normalizeBreaks() {
		    for (lineBreak in getElementsByTag("br")) {
		        lineBreak.replaceWith(TextNode(LINE_BREAK_SIGIL))
		    }
		}

		/** Returns `true` when this element is a `<ul>` or an `<ol>`. */
		private fun Element.isList(): Boolean = tagName() == "ul" || tagName() == "ol"

		/**
		 * Converts every top-level `<ul>`/`<ol>` under this element into Markdown list text.
		 * Nested lists are handled by `normalizeList` while it processes their ancestor.
		 */
		private fun Element.normalizeLists() {
		    val everyList = getElementsByTag("ul") + getElementsByTag("ol")
		    // Only operate on lists that are top-level (are not nested within other lists).
		    // The selection is completed before any list is rewritten, because rewriting a
		    // list changes the ancestry of the lists nested inside it.
		    val topLevelLists = everyList.filterNot { candidate ->
		        candidate.parents().any { ancestor -> ancestor.isList() }
		    }
		    for (list in topLevelLists) {
		        list.normalizeList()
		    }
		}

		/**
		 * Replaces this list element with a text node holding its Markdown rendering.
		 *
		 * @param indent nesting depth of this list, starting at 1; every level adds two
		 *   space sigils in front of each item
		 */
		private fun Element.normalizeList(indent: Int = 1) {
		    // First, replace nested lists so that each item's text already contains them
		    for (item in children()) {
		        if (item.tagName() != "li") continue
		        item.children()
		            .filter { it.isList() }
		            .forEach { nested -> nested.normalizeList(indent + 1) }
		    }

		    // Then format the list items down to Markdown
		    val bullet = if (tagName() == "ul") "- " else "1. "
		    val padding = SPACE_SIGIL.repeat(indent * 2)
		    // A nested list starts on its own line, below the text of its parent item
		    val lead = if (indent == 1) "" else LINE_BREAK_SIGIL
		    val markdown = children().joinToString(separator = "", prefix = lead) { item ->
		        "$padding$bullet${item.text().trim()}$LINE_BREAK_SIGIL"
		    }
		    replaceWith(TextNode(markdown))
		}

		/**
		 * Runs `normalizeDescriptionList` on every `<dl>` element under this element.
		 */
		private fun Element.normalizeDescriptionLists() {
		    for (descriptionList in getElementsByTag("dl")) {
		        descriptionList.normalizeDescriptionList()
		    }
		}

		/**
		 * Rewrites this `<dl>` element: each `<dt>` becomes a bold (`__…__`) title on its
		 * own line inside a `<span>`, each `<dd>` becomes a `<p>`, and the list itself is
		 * turned into a `<span>` that ends with a line break sigil.
		 */
		private fun Element.normalizeDescriptionList() {
		    for (term in getElementsByTag("dt")) {
		        val title = term.text()
		        term.text("${LINE_BREAK_SIGIL}__${title}__$LINE_BREAK_SIGIL")
		        term.changeInto("span")
		    }
		    for (definition in getElementsByTag("dd")) {
		        definition.changeInto("p")
		    }
		    appendChild(TextNode(LINE_BREAK_SIGIL))
		    changeInto("span")
		}

		/**
		 * Replaces every `<p>` element with a text node holding the paragraph's text
		 * surrounded by line break sigils.
		 */
		private fun Element.normalizeParagraphs() {
		    for (paragraph in getElementsByTag("p")) {
		        val block = LINE_BREAK_SIGIL + paragraph.text() + LINE_BREAK_SIGIL
		        paragraph.replaceWith(TextNode(block))
		    }
		}

		/**
		 * Logs a single warning listing (sorted, de-duplicated) the tag names still present
		 * under this element, ignoring `body` and `span`.
		 *
		 * @param moduleName prefix placed in brackets at the start of the warning
		 */
		private fun Element.warnOnUnrecognizedElements(moduleName: String) {
		    // `body` is always present, and other passes deliberately replace elements with `span`
		    val expectedTags = setOf("body", "span")
		    val tags = allElements
		        .map { elem -> elem.tagName() }
		        .filterNot { tag -> tag in expectedTags }
		        .toSortedSet()
		        .joinToString(", ")
		    if (tags.isEmpty()) {
		        return
		    }
		    logger.warning { "[$moduleName] Unrecognized HTML tags encountered when normalizing text: $tags" }
		}

		/**
		 * Turns the sigils back into whitespace and tidies up the resulting lines.
		 *
		 * @return the text with sigils replaced, runs of blank lines collapsed, trailing
		 *   whitespace removed, and leading/trailing newlines chopped off
		 */
		private fun String.normalizeLineWhitespace(): String {
		    // Convert sigils back into whitespace
		    val withWhitespace = replace(LINE_BREAK_SIGIL, "\n").replace(SPACE_SIGIL, " ")
		    // Replace long runs of linebreaks with just two line breaks
		    val collapsed = withWhitespace.replace(Regex("\n\n\n+"), "\n\n")
		    // Remove trailing whitespace from each line
		    val withoutTrailing = collapsed.replace(Regex("[ \t]+\n"), "\n")
		    // Remove leading whitespace from each line when it's not a list item: lines whose
		    // first non-blank character is `-` or `1` keep their indentation. The pattern
		    // needs a preceding newline, so the very first line is never touched here.
		    val withoutLeading = withoutTrailing.replace(Regex("\n[ \t]+([^ \t\\-1])"), "\n$1")
		    // Chop off leading newlines, then trailing newlines
		    return withoutLeading
		        .replace(Regex("^\n+"), "")
		        .replace(Regex("\n+$"), "")
		}
		}

aws/sdk-codegen/src/test/kotlin/AwsReadmeDecoratorTest.kt

0 → 100644

+105 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
		* SPDX-License-Identifier: Apache-2.0.
		*/

		import org.junit.jupiter.api.Assertions.assertEquals
		import org.junit.jupiter.api.Test
		import software.amazon.smithy.rustsdk.AwsReadmeDecorator

		/**
		 * Tests for `AwsReadmeDecorator.normalizeDescription`. Each test feeds an HTML snippet
		 * (with an empty module name) and asserts the exact Markdown text that comes back.
		 */
		class AwsReadmeDecoratorTest {
		// Covers `<fullname>` removal, bold/italic conversion, anchor conversion, and
		// paragraphs being separated by blank lines.
		@Test
		fun `it converts description HTML into Markdown`() {
		assertEquals(
		"""
		This is __some paragraph__ of information.

		This is _another_ paragraph of information.

		More information [can be found here](https://example.com).
		""".trimIndent(),
		AwsReadmeDecorator().normalizeDescription(
		"",
		"""
		<fullname>Some service</fullname>
		<p>This is <b>some paragraph</b>
		of information.</p>
		<p>This is <i>another</i> paragraph
		of information.</p>
		<p>More information <a href="https://example.com">can be found here</a>.</p>
		""".trimIndent()
		)
		)
		}

		// Covers an unordered list that contains a nested ordered list, with text before
		// and after the list.
		@Test
		fun `it converts lists`() {
		assertEquals(
		"""
		Some text introducing a list:
		- foo bar baz
		- baz bar foo
		1. nested item
		1. another

		More text.
		""".trimIndent(),
		AwsReadmeDecorator().normalizeDescription(
		"",
		"""
		<p>Some text introducing a list:
		<ul>
		<li>foo bar baz</li>
		<li>baz bar foo
		<ol><li>nested item</li><li>another</li></ol>
		</li>
		</ul> More text.</p>
		""".trimIndent()
		)
		)
		}

		// Covers `<dl>` description lists: terms become bold titles, descriptions become
		// paragraphs, including a term/description pair wrapped in a `<div>`.
		@Test
		fun `it converts description lists`() {
		assertEquals(
		"""
		Some text introducing a description list:

		__Something__

		Some description of [something](test).

		__Another thing__

		Some description of another thing.

		A second paragraph that describes another thing.

		__MDN says these can be wrapped in divs__

		So here we are

		Some trailing text.
		""".trimIndent(),
		AwsReadmeDecorator().normalizeDescription(
		"",
		"""
		<p>Some text introducing a description list:
		<dl>
		<dt>Something</dt>
		<dd>Some description of <a href="test">something</a>.</dd>
		<dt>Another thing</dt>
		<dd>Some description of another thing.</dd>
		<dd>A second paragraph that describes another thing.</dd>
		<div>
		<dt>MDN says these can be wrapped in divs</dt>
		<dd>So here we are</dd>
		</div>
		</dl>
		Some trailing text.
		</p>
		""".trimIndent()
		)
		)
		}
		}

Admin message