Commit 8b5eb685 (unverified), authored by John DiSanti, committed by GitHub
Browse files

Fix generated AWS readme descriptions (#777)

* Fix generated AWS readme descriptions

* Handle `important` and description list tags
parent dbc5401c
Loading
Loading
Loading
Loading
+179 −8
Original line number Diff line number Diff line
@@ -6,6 +6,8 @@
package software.amazon.smithy.rustsdk

import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import software.amazon.smithy.model.traits.DocumentationTrait
import software.amazon.smithy.rust.codegen.rustlang.raw
import software.amazon.smithy.rust.codegen.smithy.CodegenContext
@@ -13,6 +15,13 @@ import software.amazon.smithy.rust.codegen.smithy.RustCrate
import software.amazon.smithy.rust.codegen.smithy.customize.RustCodegenDecorator
import software.amazon.smithy.rust.codegen.smithy.generators.ManifestCustomizations
import software.amazon.smithy.rust.codegen.util.getTrait
import java.lang.StringBuilder
import java.util.logging.Logger

// Use a sigil that should always be unique in the text to fix line breaks and spaces
// since Jsoup doesn't preserve whitespace at all.
// The sigils are written into the document as plain text while it is being normalized, and are
// swapped for a real newline / space character once the text has been extracted.
private const val LINE_BREAK_SIGIL = "[[smithy-rs-br]]"
private const val SPACE_SIGIL = "[[smithy-rs-nbsp]]"

/**
 * Generates a README.md for each service crate for display on crates.io.
@@ -21,16 +30,17 @@ class AwsReadmeDecorator : RustCodegenDecorator {
    // Identifier reported for this decorator.
    override val name: String = "AwsReadmeDecorator"
    // NOTE(review): presumably positions this decorator relative to others — confirm against RustCodegenDecorator.
    override val order: Byte = 0

    // Logger named after this class; used to warn about HTML tags the normalizer did not handle.
    private val logger: Logger = Logger.getLogger(javaClass.name)

    /**
     * Adds `readme = "README.md"` under the `package` key of the manifest customizations, matching
     * the `README.md` file that [extras] writes into the crate.
     */
    override fun crateManifestCustomizations(codegenContext: CodegenContext): ManifestCustomizations {
        val packageSection = mapOf("readme" to "README.md")
        return mapOf("package" to packageSection)
    }

    override fun extras(codegenContext: CodegenContext, rustCrate: RustCrate) {
        rustCrate.withFile("README.md") { writer ->
            // Strip HTML from the doc trait value. In the future when it's written, we can use our Rustdoc
            // documentation normalization code to convert this to Markdown.
            val description = Jsoup.parse(
            val description = normalizeDescription(
                codegenContext.moduleName,
                codegenContext.settings.getService(codegenContext.model).getTrait<DocumentationTrait>()?.value ?: ""
            ).text()
            )
            val moduleName = codegenContext.settings.moduleName

            writer.raw(
@@ -39,9 +49,9 @@ class AwsReadmeDecorator : RustCodegenDecorator {

                **Please Note: The SDK is currently released as an alpha and is intended strictly for
                feedback purposes only. Do not use this SDK for production workloads.**

                $description

                """.trimIndent() +
                    "\n\n$description\n\n" +
                    """
                ## Getting Started

                > Examples are availble for many services and operations, check out the
@@ -78,4 +88,165 @@ class AwsReadmeDecorator : RustCodegenDecorator {
            )
        }
    }

    /**
     * Strips HTML from the description and makes it human-readable Markdown.
     *
     * The input is parsed with Jsoup, the body is rewritten in place by a fixed sequence of
     * normalization passes, and the resulting text is cleaned up line by line.
     *
     * @param moduleName name included in the warning logged when unhandled HTML tags remain
     * @param input the raw (HTML) documentation text
     * @return the Markdown-ish text with sigils turned back into whitespace
     */
    internal fun normalizeDescription(moduleName: String, input: String): String {
        val doc = Jsoup.parse(input)
        doc.body().apply {
            // The order of operations here is important:
            stripUndesiredNodes() // Remove `<fullname>`, blank whitespace nodes, etc
            normalizeInlineStyles() // Convert bold/italics tags to Markdown equivalents
            normalizeAnchors() // Convert anchor tags into Markdown links
            normalizeBreaks() // Convert `<br>` tags into newlines
            normalizeLists() // Convert HTML lists into Markdown lists
            normalizeDescriptionLists() // Convert HTML <dl> description lists into Markdown
            normalizeParagraphs() // Convert paragraph tags into blocks of text separated by newlines
            warnOnUnrecognizedElements(moduleName) // Log a warning if we missed something
        }
        // `normalizeLineWhitespace` already converts both sigils back into whitespace as its first
        // step, so they are not replaced a second time here.
        return doc.body().text().normalizeLineWhitespace()
    }

    /**
     * Drops or unwraps elements that should not show up in the normalized text:
     * `<fullname>` and `<note>` are removed, `<important>` is turned into a `<span>`,
     * and blank text nodes returned by `textNodes()` are deleted.
     *
     * @throws IllegalStateException if a `<note>` element has no child elements
     */
    private fun Element.stripUndesiredNodes() {
        // Remove the `<fullname>` tag
        getElementsByTag("fullname").forEach { it.remove() }
        // Unwrap `<important>` tags
        getElementsByTag("important").forEach { it.changeInto("span") }
        // Remove the `<note>` tag
        getElementsByTag("note").forEach { note ->
            // The guard has always fired when the note has NO child elements; the message used to
            // claim the opposite. The condition is kept as-is and the message now matches it.
            // NOTE(review): confirm that rejecting child-less notes (rather than notes with
            // children) is the intended check.
            check(note.children().isNotEmpty()) { "<note> tag unexpectedly had no children" }
            note.remove()
        }

        // Eliminate empty whitespace
        textNodes()
            .filter { it.isBlank }
            .forEach { it.remove() }
    }

    /**
     * Swaps this element for a freshly created [tagName] element that holds copies of this
     * element's child nodes.
     */
    private fun Element.changeInto(tagName: String) {
        val replacement = Element(tagName)
        replacement.appendChildren(childNodesCopy())
        replaceWith(replacement)
    }

    /** Rewrites `<b>` as `__text__` and `<i>` as `_text_`. */
    private fun Element.normalizeInlineStyles() {
        for (bold in getElementsByTag("b")) {
            normalizeInlineStyleTag("__", bold)
        }
        for (italic in getElementsByTag("i")) {
            normalizeInlineStyleTag("_", italic)
        }
    }

    /**
     * Replaces [tag] with a `<span>` containing copies of its child nodes, with [surround]
     * placed before and after them.
     */
    private fun normalizeInlineStyleTag(surround: String, tag: Element) {
        val wrapped = Element("span")
        wrapped.append(surround)
        wrapped.appendChildren(tag.childNodesCopy())
        wrapped.append(surround)
        tag.replaceWith(wrapped)
    }

    /**
     * Replaces every `<a>` with a text node: a Markdown link `[text](href)` when the `href`
     * attribute is not blank, otherwise just the anchor's text.
     */
    private fun Element.normalizeAnchors() {
        getElementsByTag("a").forEach { anchor ->
            val label = anchor.text()
            val href = anchor.attr("href")
            val markdown = if (href.isBlank()) label else "[$label]($href)"
            anchor.replaceWith(TextNode(markdown))
        }
    }

    /** Replaces every `<br>` with a text node holding the line-break sigil. */
    private fun Element.normalizeBreaks() {
        for (lineBreak in getElementsByTag("br")) {
            lineBreak.replaceWith(TextNode(LINE_BREAK_SIGIL))
        }
    }

    /** True when this element is a `<ul>` or an `<ol>`. */
    private fun Element.isList(): Boolean = when (tagName()) {
        "ul", "ol" -> true
        else -> false
    }

    /**
     * Converts every list that is not nested inside another list; nested lists are handled
     * recursively by [normalizeList].
     */
    private fun Element.normalizeLists() {
        val candidates = getElementsByTag("ul") + getElementsByTag("ol")
        // Decide which lists are top-level before rewriting any of them, because rewriting a
        // list changes the tree that `parents()` walks.
        val topLevelLists = candidates.filter { candidate ->
            candidate.parents().none { ancestor -> ancestor.isList() }
        }
        for (list in topLevelLists) {
            list.normalizeList()
        }
    }

    /**
     * Replaces this list element with a single text node of Markdown list items.
     *
     * Each child becomes one line, indented by `indent * 2` space sigils and prefixed with
     * `- ` for `<ul>` or `1. ` otherwise. Lists nested at a deeper level start with a
     * line-break sigil.
     *
     * @param indent nesting depth, starting at 1 for a top-level list
     */
    private fun Element.normalizeList(indent: Int = 1) {
        // Flatten nested lists first so their text is already Markdown when read below
        children()
            .filter { it.tagName() == "li" }
            .forEach { item ->
                item.children()
                    .filter { it.isList() }
                    .forEach { nested -> nested.normalizeList(indent + 1) }
            }

        // Then render every child of this list as one Markdown line
        val bullet = if (tagName() == "ul") "- " else "1. "
        val padding = SPACE_SIGIL.repeat(indent * 2)
        val markdown = StringBuilder()
        if (indent != 1) {
            markdown.append(LINE_BREAK_SIGIL)
        }
        children().forEach { entry ->
            markdown.append(padding)
                .append(bullet)
                .append(entry.text().trim())
                .append(LINE_BREAK_SIGIL)
        }
        replaceWith(TextNode(markdown.toString()))
    }

    /** Applies [normalizeDescriptionList] to every `<dl>` element. */
    private fun Element.normalizeDescriptionLists() {
        for (descriptionList in getElementsByTag("dl")) {
            descriptionList.normalizeDescriptionList()
        }
    }

    /**
     * Rewrites a description list: each `<dt>` becomes a `<span>` whose text is the bolded term
     * surrounded by line-break sigils, each `<dd>` becomes a `<p>`, and the list itself becomes
     * a `<span>` ending in a line-break sigil.
     */
    private fun Element.normalizeDescriptionList() {
        for (term in getElementsByTag("dt")) {
            val title = term.text()
            term.text("${LINE_BREAK_SIGIL}__${title}__$LINE_BREAK_SIGIL")
            term.changeInto("span")
        }
        for (details in getElementsByTag("dd")) {
            details.changeInto("p")
        }
        appendChild(TextNode(LINE_BREAK_SIGIL))
        changeInto("span")
    }

    /** Replaces every `<p>` with a text node of its text wrapped in line-break sigils. */
    private fun Element.normalizeParagraphs() {
        for (paragraph in getElementsByTag("p")) {
            val content = paragraph.text()
            paragraph.replaceWith(TextNode(LINE_BREAK_SIGIL + content + LINE_BREAK_SIGIL))
        }
    }

    /**
     * Logs one warning listing, in sorted order, the distinct tag names still present under this
     * element, ignoring `body` (always present) and `span` (produced by the normalizers).
     *
     * @param moduleName prefix placed in square brackets at the start of the warning
     */
    private fun Element.warnOnUnrecognizedElements(moduleName: String) {
        val tags = allElements
            .map { elem -> elem.tagName() }
            .filterNot { tagName -> tagName == "body" || tagName == "span" }
            .toSortedSet()
            .joinToString(", ")
        if (tags.isEmpty()) {
            return
        }
        logger.warning { "[$moduleName] Unrecognized HTML tags encountered when normalizing text: $tags" }
    }

    /**
     * Turns sigils back into whitespace and tidies the result: runs of three or more newlines
     * collapse to two, trailing spaces/tabs are dropped from lines, leading spaces/tabs are
     * dropped unless the line's first other character is `-` or `1`, and leading and trailing
     * newlines are removed.
     */
    private fun String.normalizeLineWhitespace(): String {
        // Convert sigils back into whitespace
        val withWhitespace = replace(LINE_BREAK_SIGIL, "\n").replace(SPACE_SIGIL, " ")
        // Replace long runs of linebreaks with just two line breaks
        val collapsedBreaks = withWhitespace.replace(Regex("\n\n\n+"), "\n\n")
        // Remove trailing whitespace from each line
        val withoutTrailing = collapsedBreaks.replace(Regex("[ \t]+\n"), "\n")
        // Remove leading whitespace from each line when it's not a list item
        val withoutLeading = withoutTrailing.replace(Regex("\n[ \t]+([^ \t\\-1])"), "\n$1")
        // Chop off leading newlines, then trailing newlines
        return withoutLeading
            .replace(Regex("^\n+"), "")
            .replace(Regex("\n+$"), "")
    }
}
+105 −0
Original line number Diff line number Diff line
/*
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Test
import software.amazon.smithy.rustsdk.AwsReadmeDecorator

/**
 * Tests for `AwsReadmeDecorator.normalizeDescription`, which turns HTML documentation text into
 * Markdown-style text. Each test passes an empty module name and compares the full output.
 */
class AwsReadmeDecoratorTest {
    /**
     * `<fullname>` is dropped, `<b>`/`<i>` become `__`/`_` emphasis, anchors become Markdown
     * links, and each `<p>` ends up as its own block separated by a blank line.
     */
    @Test
    fun `it converts description HTML into Markdown`() {
        assertEquals(
            """
            This is __some paragraph__ of information.

            This is _another_ paragraph of information.

            More information [can be found here](https://example.com).
            """.trimIndent(),
            AwsReadmeDecorator().normalizeDescription(
                "",
                """
                <fullname>Some service</fullname>
                <p>This is <b>some paragraph</b>
                of information.</p>
                <p>This is <i>another</i> paragraph
                of information.</p>
                <p>More information <a href="https://example.com">can be found here</a>.</p>
                """.trimIndent()
            )
        )
    }

    /**
     * A `<ul>` containing a nested `<ol>` becomes indented `- ` items with deeper-indented
     * `1. ` items beneath the entry that held the nested list.
     */
    @Test
    fun `it converts lists`() {
        assertEquals(
            """
            Some text introducing a list:
              - foo bar baz
              - baz bar foo
                1. nested item
                1. another

            More text.
            """.trimIndent(),
            AwsReadmeDecorator().normalizeDescription(
                "",
                """
                <p>Some text introducing a list:
                <ul>
                  <li>foo bar baz</li>
                  <li>baz bar foo
                    <ol><li>nested item</li><li>another</li></ol>
                  </li>
                </ul> More text.</p>
                """.trimIndent()
            )
        )
    }

    /**
     * A `<dl>` becomes bolded terms (`<dt>`) each followed by their descriptions (`<dd>`) as
     * separate paragraphs, including terms wrapped in a `<div>` inside the list.
     */
    @Test
    fun `it converts description lists`() {
        assertEquals(
            """
            Some text introducing a description list:

            __Something__

            Some description of [something](test).

            __Another thing__

            Some description of another thing.

            A second paragraph that describes another thing.

            __MDN says these can be wrapped in divs__

            So here we are

            Some trailing text.
            """.trimIndent(),
            AwsReadmeDecorator().normalizeDescription(
                "",
                """
                <p>Some text introducing a description list:
                <dl>
                  <dt>Something</dt>
                  <dd>Some description of <a href="test">something</a>.</dd>
                  <dt>Another thing</dt>
                  <dd>Some description of another thing.</dd>
                  <dd>A second paragraph that describes another thing.</dd>
                  <div>
                      <dt>MDN says these can be wrapped in divs</dt>
                      <dd>So here we are</dd>
                  </div>
                </dl>
                Some trailing text.
                </p>
                """.trimIndent()
            )
        )
    }
}