Unverified commit 59a47833, authored by Russell Cohen and committed by GitHub
Browse files

First pass at optimizing SplitOnWordBoundaries (#3140)

## Motivation and Context
@Velfi observed this function consuming a significant amount of codegen
time.

## Description
Optimize split on word boundaries, primarily by avoiding recomputation
of `isStartOfWord` when we already know it isn't the start of a word.

## Testing
- Original: 3.4s. Updated: 688ms
- Existing tests are exhaustive

On a `:aws:sdk:assemble` run for the smoke test services, the build time improved
from 53 seconds to 42 seconds.
----

_By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice._
parent c8edefee
Loading
Loading
Loading
Loading
+17 −10
Original line number Diff line number Diff line
@@ -19,13 +19,16 @@ fun String.doubleQuote(): String =
/** Shorthand alias for [doubleQuote]: returns whatever [doubleQuote] returns for this string. */
fun String.dq(): String = doubleQuote()

/**
 * Lowercase tokens that `splitOnWordBoundaries` compares, ignoring case, against the word it is
 * currently accumulating. They are whole words that may appear with unusual casing
 * (e.g. `IPv4`, `MiB`, `GiB`).
 */
private val completeWords: List<String> = listOf(
    "ipv4",
    "ipv6",
    "sigv4",
    "mib",
    "gib",
    "kib",
    "ttl",
)

private fun String.splitOnWordBoundaries(): List<String> {
    val out = mutableListOf<String>()
    // These are whole words but cased differently, e.g. `IPv4`, `MiB`, `GiB`, `TtL`
    var currentWord = ""

    var completeWordInProgress = true
    // emit the current word and update from the next character
    val emit = { next: Char ->
        completeWordInProgress = true
        if (currentWord.isNotEmpty()) {
            out += currentWord.lowercase()
        }
@@ -37,13 +40,17 @@ private fun String.splitOnWordBoundaries(): List<String> {
    }
    val allLowerCase = this.lowercase() == this
    this.forEachIndexed { index, nextCharacter ->
        val peek = this.getOrNull(index + 1)
        val doublePeek = this.getOrNull(index + 2)
        val completeWordInProgress = completeWords.any {
            (currentWord + this.substring(index)).lowercase().startsWith(
        val computeWordInProgress = {
            val result = completeWordInProgress && currentWord.isNotEmpty() && completeWords.any {
                it.startsWith(currentWord, ignoreCase = true) && (currentWord + this.substring(index)).startsWith(
                    it,
            )
        } && !completeWords.contains(currentWord.lowercase())
                    ignoreCase = true,
                ) && !it.equals(currentWord, ignoreCase = true)
            }

            completeWordInProgress = result
            result
        }
        when {
            // [C] in these docs indicates the value of nextCharacter
            // A[_]B
@@ -53,15 +60,15 @@ private fun String.splitOnWordBoundaries(): List<String> {
            currentWord.isEmpty() -> currentWord += nextCharacter.toString()

            // Abc[D]ef or Ab2[D]ef
            !completeWordInProgress && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)
            !computeWordInProgress() && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)

            // s3[k]ey
            !completeWordInProgress && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
            !computeWordInProgress() && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
                nextCharacter,
            )

            // DB[P]roxy, or `IAM[U]ser` but not AC[L]s
            endOfAcronym(currentWord, nextCharacter, peek, doublePeek) -> emit(nextCharacter)
            endOfAcronym(currentWord, nextCharacter, this.getOrNull(index + 1), this.getOrNull(index + 2)) -> emit(nextCharacter)

            // If we haven't found a word boundary, push it and keep going
            else -> currentWord += nextCharacter.toString()