Unverified Commit 1b5453f8 authored by John DiSanti's avatar John DiSanti Committed by GitHub
Browse files

Escape control characters in JSON strings (#427)

* Escape control characters in JSON strings

* CR feedback
parent a5e2c4bb
Loading
Loading
Loading
Loading
+53 −29
Original line number Diff line number Diff line
@@ -5,35 +5,44 @@

use std::borrow::Cow;

/// Characters that have a dedicated two-character JSON escape: `"`, `\`, backspace (U+0008),
/// form feed (U+000C), line feed, carriage return, and tab.
const ESCAPES: &[char] = &['"', '\\', '\u{08}', '\u{0C}', '\n', '\r', '\t'];

/// Escapes a string for embedding in a JSON string value.
pub fn escape_string(value: &str) -> Cow<str> {
    if !value.contains(ESCAPES) {
        return Cow::Borrowed(value);
    let bytes = value.as_bytes();
    for (index, byte) in bytes.iter().enumerate() {
        match byte {
            0..=0x1F | b'"' | b'\\' => {
                return Cow::Owned(escape_string_inner(&bytes[0..index], &bytes[index..]))
            }
            _ => {}
        }
    }
    Cow::Borrowed(value)
}

    let mut escaped = String::new();
    let (mut last, end) = (0, value.len());
    for (index, chr) in value
        .char_indices()
        .filter(|(_index, chr)| ESCAPES.contains(chr))
    {
        escaped.push_str(&value[last..index]);
        escaped.push_str(match chr {
            '"' => "\\\"",
            '\\' => "\\\\",
            '\u{08}' => "\\b",
            '\u{0C}' => "\\f",
            '\n' => "\\n",
            '\r' => "\\r",
            '\t' => "\\t",
            _ => unreachable!(),
        });
        last = index + 1;
/// Builds the JSON-escaped form of a string that has been split into two byte slices.
///
/// `start` is copied to the output verbatim. `rest` is processed byte by byte: `"`, `\`, and
/// every control byte in `0x00..=0x1F` is replaced by its escape sequence, and all other
/// bytes are copied through unchanged.
///
/// `start` followed by `rest` must be valid UTF-8 (for example, the two halves of a `&str`);
/// the result is converted to a `String` without re-validation.
fn escape_string_inner(start: &[u8], rest: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    // The input length plus one is only a starting capacity; the vector grows as needed.
    let mut escaped = Vec::with_capacity(start.len() + rest.len() + 1);
    escaped.extend_from_slice(start);

    for &byte in rest {
        match byte {
            b'"' => escaped.extend_from_slice(b"\\\""),
            b'\\' => escaped.extend_from_slice(b"\\\\"),
            0x08 => escaped.extend_from_slice(b"\\b"),
            0x0C => escaped.extend_from_slice(b"\\f"),
            b'\n' => escaped.extend_from_slice(b"\\n"),
            b'\r' => escaped.extend_from_slice(b"\\r"),
            b'\t' => escaped.extend_from_slice(b"\\t"),
            // Remaining control bytes have no short form and become `\u00XX` (lowercase hex).
            0..=0x1F => escaped.extend_from_slice(&[
                b'\\',
                b'u',
                b'0',
                b'0',
                HEX_DIGITS[usize::from(byte >> 4)],
                HEX_DIGITS[usize::from(byte & 0x0F)],
            ]),
            _ => escaped.push(byte),
        }
    }

    debug_assert!(std::str::from_utf8(&escaped).is_ok());
    // SAFETY:
    // - The original input was valid UTF-8 since it came in as a `&str`
    // - Only single-byte (ASCII) code points were replaced, so no multi-byte sequence was split
    // - The escape sequences themselves are ASCII and therefore valid UTF-8
    unsafe { String::from_utf8_unchecked(escaped) }
}

#[cfg(test)]
@@ -53,16 +62,31 @@ mod test {
            escape_string("\u{08}f\u{0C}o\to\r\n").as_ref()
        );
        assert_eq!("\\\"test\\\"", escape_string("\"test\"").as_ref());
        assert_eq!("\\u0000", escape_string("\u{0}").as_ref());
        assert_eq!("\\u001f", escape_string("\u{1f}").as_ref());
    }

    use proptest::proptest;
    proptest! {
        #[test]
        fn matches_serde_json(s: String) {
            assert_eq!(
                serde_json::to_string(&s).unwrap(),
                format!(r#""{}""#, escape_string(&s))
            )
        fn matches_serde_json(s in ".*") {
            let serde_escaped = serde_json::to_string(&s).unwrap();
            let serde_escaped = &serde_escaped[1..(serde_escaped.len() - 1)];
            assert_eq!(serde_escaped,escape_string(&s))
        }
    }

    /// Compares `escape_string` with `serde_json` for every Unicode scalar value.
    #[test]
    #[ignore] // This tests escaping of all codepoints, but can take a long time in debug builds
    fn all_codepoints() {
        // Iterating a `char` range visits exactly the valid scalar values (the surrogate gap
        // is skipped), instead of probing all ~4.3 billion `u32` values with `char::from_u32`.
        for chr in '\u{0}'..=char::MAX {
            let string = String::from(chr);
            let escaped = escape_string(&string);
            let serde_escaped = serde_json::to_string(&string).unwrap();
            // Drop the leading and trailing `"` that serde_json adds around the value.
            let serde_escaped = &serde_escaped[1..(serde_escaped.len() - 1)];
            assert_eq!(escaped.as_ref(), serde_escaped);
        }
    }
}