From 5ef273d5e4ce8bba775b503c200b628841e2bc6e Mon Sep 17 00:00:00 2001 From: Denny Wong <85551679+denwong47@users.noreply.github.com> Date: Thu, 23 Mar 2023 04:41:05 +0000 Subject: [PATCH] Added escaping to ActiveEnum to allow for non-UAX#31 chars, as well as camel_case related conflicts. (#1374) * Added escaping to ActiveEnum to allow for non-Unicode Standard Annex #31 characters, as well as addressing issues with ' ' and '_' causing potential identifier conflicts. * Improved docstring for camel_case_with_escaped_non_xid. * Moved underscore prepending to camel_case_with_escaped_non_xid. * cargo fmt and code fixes to resolve CI failures. * Added unittest to sea_orm_macros::util for new function. * Fixed a typo in a doc code block. * clippy * Test cases --------- Co-authored-by: Billy Chan --- sea-orm-macros/Cargo.toml | 3 +- sea-orm-macros/src/derives/active_enum.rs | 9 +- sea-orm-macros/src/derives/util.rs | 131 ++++++++++++++++++++++ sea-orm-macros/src/lib.rs | 3 + src/entity/active_enum.rs | 85 ++++++++++++++ 5 files changed, 225 insertions(+), 6 deletions(-) diff --git a/sea-orm-macros/Cargo.toml b/sea-orm-macros/Cargo.toml index 09b0c2d8..e11ea2ba 100644 --- a/sea-orm-macros/Cargo.toml +++ b/sea-orm-macros/Cargo.toml @@ -23,9 +23,10 @@ syn = { version = "1", default-features = false, features = ["parsing", "proc-ma quote = { version = "1", default-features = false } heck = { version = "0.4", default-features = false } proc-macro2 = { version = "1", default-features = false } +unicode-ident = { version = "1" } [dev-dependencies] -sea-orm = { path = "../", features = ["macros"] } +sea-orm = { path = "../", features = ["macros", "tests-cfg"] } serde = { version = "1.0", features = ["derive"] } [features] diff --git a/sea-orm-macros/src/derives/active_enum.rs b/sea-orm-macros/src/derives/active_enum.rs index 1131c211..aa85f8a9 100644 --- a/sea-orm-macros/src/derives/active_enum.rs +++ b/sea-orm-macros/src/derives/active_enum.rs @@ -1,3 +1,4 @@ +use super::util::camel_case_with_escaped_non_uax31; use heck::ToUpperCamelCase; use proc_macro2::TokenStream; use quote::{format_ident, quote, quote_spanned}; @@ -247,11 +248,9 @@ impl ActiveEnum { let enum_variants: Vec = str_variants .iter() .map(|v| { - if v.chars().next().map(char::is_numeric).unwrap_or(false) { - format_ident!("_{}", v) - } else { - format_ident!("{}", v.to_upper_camel_case()) - } + let v_cleaned = camel_case_with_escaped_non_uax31(v); + + format_ident!("{}", v_cleaned) }) .collect(); diff --git a/sea-orm-macros/src/derives/util.rs b/sea-orm-macros/src/derives/util.rs index fd9be823..7de4b79c 100644 --- a/sea-orm-macros/src/derives/util.rs +++ b/sea-orm-macros/src/derives/util.rs @@ -1,3 +1,4 @@ +use heck::ToUpperCamelCase; use quote::format_ident; use syn::{punctuated::Punctuated, token::Comma, Field, Ident, Meta}; @@ -54,6 +55,97 @@ where } } +/// Turn a string to PascalCase while escaping all special characters in ASCII words. +/// +/// (camel_case is used here to match naming of heck.) +/// +/// In ActiveEnum, string_value will be PascalCased and made +/// an identifier in {Enum}Variant. +/// +/// However Rust only allows for XID_Start char followed by +/// XID_Continue characters as identifiers; this causes a few +/// problems: +/// +/// - `string_value = ""` will cause a panic; +/// - `string_value` containing only non-alphanumerics will become `""` +/// and cause the above panic; +/// - `string_values`: +/// - `"A B"` +/// - `"A B"` +/// - `"A_B"` +/// - `"A_ B"` +/// shares the same identifier of `"AB"`; +/// +/// This function does the PascelCase conversion with a few special escapes: +/// - Non-Unicode Standard Annex #31 compliant characters will converted to their hex notation; +/// - `"_"` into `"0x5F"`; +/// - `" "` into `"0x20"`; +/// - Empty strings will become special keyword of `"__Empty"` +/// +/// Note that this does NOT address: +/// +/// - case-sensitivity. String value "ABC" and "abc" remains +/// conflicted after .camel_case(). +/// +/// Example Conversions: +/// +/// ```ignore +/// assert_eq!(camel_case_with_escaped_non_uax31(""), "__Empty"); +/// assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x20"); +/// assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x200x20"); +/// assert_eq!(camel_case_with_escaped_non_uax31("_"), "_0x5F"); +/// assert_eq!(camel_case_with_escaped_non_uax31("foobar"), "Foobar"); +/// assert_eq!(camel_case_with_escaped_non_uax31("foo bar"), "Foo0x20bar"); +/// ``` +pub(crate) fn camel_case_with_escaped_non_uax31(string: T) -> String +where + T: ToString, +{ + let additional_chars_to_replace: [char; 2] = ['_', ' ']; + + let mut rebuilt = string + .to_string() + .chars() + .enumerate() + .map(|(pos, char_)| { + if !additional_chars_to_replace.contains(&char_) + && match pos { + 0 => unicode_ident::is_xid_start(char_), + _ => unicode_ident::is_xid_continue(char_), + } + { + char_.to_string() + } else { + format!("{:#X}", char_ as u32) + } + }) + .reduce( + // Join the "characters" (now strings) + // back together + |lhs, rhs| lhs + rhs.as_str(), + ) + .map_or( + // if string_value is "" + // Make sure the default does NOT go through camel_case, + // as the __ will be removed! The underscores are + // what guarantees this being special case avoiding + // all potential conflicts. + String::from("__Empty"), + |s| s.to_upper_camel_case(), + ); + + if rebuilt + .chars() + .next() + .map(char::is_numeric) + .unwrap_or(false) + { + rebuilt = String::from("_") + &rebuilt; + } + + rebuilt +} + pub(crate) const RAW_IDENTIFIER: &str = "r#"; pub(crate) const RUST_KEYWORDS: [&str; 49] = [ @@ -65,3 +157,42 @@ pub(crate) const RUST_KEYWORDS: [&str; 49] = [ ]; pub(crate) const RUST_SPECIAL_KEYWORDS: [&str; 3] = ["crate", "Self", "self"]; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_non_uax31_escape() { + // Test empty string + assert_eq!(camel_case_with_escaped_non_uax31(""), "__Empty"); + + // Test additional_chars_to_replace (to_camel_case related characters) + assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x20"); + + // Test additional_chars_to_replace (multiples. ensure distinct from single) + assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x200x20"); + + // Test additional_chars_to_replace (udnerscores) + assert_eq!(camel_case_with_escaped_non_uax31("_"), "_0x5F"); + + // Test typical use case + assert_eq!(camel_case_with_escaped_non_uax31("foobar"), "Foobar"); + + // Test spaced words distinct from non-spaced + assert_eq!(camel_case_with_escaped_non_uax31("foo bar"), "Foo0x20bar"); + + // Test undescored words distinct from non-spaced and spaced + assert_eq!(camel_case_with_escaped_non_uax31("foo_bar"), "Foo0x5Fbar"); + + // Test leading numeric characters + assert_eq!(camel_case_with_escaped_non_uax31("1"), "_0x31"); + + // Test escaping also works on full string following lead numeric character + // This was previously a fail condition. + assert_eq!( + camel_case_with_escaped_non_uax31("1 2 3"), + "_0x310x2020x203" + ); + } +} diff --git a/sea-orm-macros/src/lib.rs b/sea-orm-macros/src/lib.rs index 89c32013..6a795d76 100644 --- a/sea-orm-macros/src/lib.rs +++ b/sea-orm-macros/src/lib.rs @@ -548,6 +548,9 @@ pub fn derive_active_model_behavior(input: TokenStream) -> TokenStream { /// - For enum variant /// - `string_value` or `num_value`: /// - For `string_value`, value should be passed as string, i.e. `string_value = "A"` +/// - Due to the way internal Enums are automatically derived, the following restrictions apply: +/// - members cannot share identical `string_value`, case-insensitive. +/// - in principle, any future Titlecased Rust keywords are not valid `string_value`. /// - For `num_value`, value should be passed as integer, i.e. `num_value = 1` or `num_value = 1i32` /// - Note that only one of it can be specified, and all variants of an enum have to annotate with the same `*_value` macro attribute /// diff --git a/src/entity/active_enum.rs b/src/entity/active_enum.rs index 786653a9..3a89c603 100644 --- a/src/entity/active_enum.rs +++ b/src/entity/active_enum.rs @@ -408,4 +408,89 @@ mod tests { test_fallback_uint!(U32Fallback, u32, "u32", "Integer", Integer); test_fallback_uint!(U64Fallback, u64, "u64", "BigInteger", BigInteger); } + + #[test] + fn escaped_non_uax31() { + #[derive(Debug, Clone, PartialEq, Eq, EnumIter, DeriveActiveEnum, Copy)] + #[sea_orm(rs_type = "String", db_type = "Enum", enum_name = "pop_os_names_typos")] + pub enum PopOSTypos { + #[sea_orm(string_value = "Pop!_OS")] + PopOSCorrect, + #[sea_orm(string_value = "Pop\u{2757}_OS")] + PopOSEmoji, + #[sea_orm(string_value = "Pop!_操作系统")] + PopOSChinese, + #[sea_orm(string_value = "PopOS")] + PopOSASCIIOnly, + #[sea_orm(string_value = "Pop OS")] + PopOSASCIIOnlyWithSpace, + #[sea_orm(string_value = "Pop!OS")] + PopOSNoUnderscore, + #[sea_orm(string_value = "Pop_OS")] + PopOSNoExclaimation, + #[sea_orm(string_value = "!PopOS_")] + PopOSAllOverThePlace, + #[sea_orm(string_value = "Pop!_OS22.04LTS")] + PopOSWithVersion, + #[sea_orm(string_value = "22.04LTSPop!_OS")] + PopOSWithVersionPrefix, + #[sea_orm(string_value = "!_")] + PopOSJustTheSymbols, + #[sea_orm(string_value = "")] + Nothing, + // This WILL fail: + // Both PopOs and PopOS will create identifier "Popos" + // #[sea_orm(string_value = "PopOs")] + // PopOSLowerCase, + } + let values = [ + "Pop!_OS", + "Pop\u{2757}_OS", + "Pop!_操作系统", + "PopOS", + "Pop OS", + "Pop!OS", + "Pop_OS", + "!PopOS_", + "Pop!_OS22.04LTS", + "22.04LTSPop!_OS", + "!_", + "", + ]; + for (variant, val) in PopOSTypos::iter().zip(values) { + assert_eq!(variant.to_value(), val); + assert_eq!(PopOSTypos::try_from_value(&val.to_owned()), Ok(variant)); + } + + #[derive(Clone, Debug, PartialEq, EnumIter, DeriveActiveEnum)] + #[sea_orm( + rs_type = "String", + db_type = "String(None)", + enum_name = "conflicting_string_values" + )] + pub enum ConflictingStringValues { + #[sea_orm(string_value = "")] + Member1, + #[sea_orm(string_value = "$")] + Member2, + #[sea_orm(string_value = "$$")] + Member3, + #[sea_orm(string_value = "AB")] + Member4, + #[sea_orm(string_value = "A_B")] + Member5, + #[sea_orm(string_value = "A$B")] + Member6, + #[sea_orm(string_value = "0 123")] + Member7, + } + type EnumVariant = ConflictingStringValuesVariant; + assert_eq!(EnumVariant::__Empty.to_string(), ""); + assert_eq!(EnumVariant::_0x24.to_string(), "$"); + assert_eq!(EnumVariant::_0x240x24.to_string(), "$$"); + assert_eq!(EnumVariant::Ab.to_string(), "AB"); + assert_eq!(EnumVariant::A0x5Fb.to_string(), "A_B"); + assert_eq!(EnumVariant::A0x24B.to_string(), "A$B"); + assert_eq!(EnumVariant::_0x300x20123.to_string(), "0 123"); + } }