Added escaping to ActiveEnum to allow for non-UAX#31 chars, as well as camel_case related conflicts. (#1374)

* Added escaping to ActiveEnum to allow for non-Unicode Standard Annex #31 characters, as well as addressing issues with ' ' and '_' causing potential identifier conflicts.

* Improved docstring for camel_case_with_escaped_non_xid.

* Moved underscore prepending to camel_case_with_escaped_non_xid.

* cargo fmt and code fixes to resolve CI failures.

* Added unittest to sea_orm_macros::util for new function.

* Fixed a typo in a doc code block.

* clippy

* Test cases

---------

Co-authored-by: Billy Chan <ccw.billy.123@gmail.com>
This commit is contained in:
Denny Wong 2023-03-23 04:41:05 +00:00 committed by GitHub
parent 16015cc35d
commit 5ef273d5e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 225 additions and 6 deletions

View File

@ -23,9 +23,10 @@ syn = { version = "1", default-features = false, features = ["parsing", "proc-ma
quote = { version = "1", default-features = false } quote = { version = "1", default-features = false }
heck = { version = "0.4", default-features = false } heck = { version = "0.4", default-features = false }
proc-macro2 = { version = "1", default-features = false } proc-macro2 = { version = "1", default-features = false }
unicode-ident = { version = "1" }
[dev-dependencies] [dev-dependencies]
sea-orm = { path = "../", features = ["macros"] } sea-orm = { path = "../", features = ["macros", "tests-cfg"] }
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
[features] [features]

View File

@ -1,3 +1,4 @@
use super::util::camel_case_with_escaped_non_uax31;
use heck::ToUpperCamelCase; use heck::ToUpperCamelCase;
use proc_macro2::TokenStream; use proc_macro2::TokenStream;
use quote::{format_ident, quote, quote_spanned}; use quote::{format_ident, quote, quote_spanned};
@ -247,11 +248,9 @@ impl ActiveEnum {
let enum_variants: Vec<syn::Ident> = str_variants let enum_variants: Vec<syn::Ident> = str_variants
.iter() .iter()
.map(|v| { .map(|v| {
if v.chars().next().map(char::is_numeric).unwrap_or(false) { let v_cleaned = camel_case_with_escaped_non_uax31(v);
format_ident!("_{}", v)
} else { format_ident!("{}", v_cleaned)
format_ident!("{}", v.to_upper_camel_case())
}
}) })
.collect(); .collect();

View File

@ -1,3 +1,4 @@
use heck::ToUpperCamelCase;
use quote::format_ident; use quote::format_ident;
use syn::{punctuated::Punctuated, token::Comma, Field, Ident, Meta}; use syn::{punctuated::Punctuated, token::Comma, Field, Ident, Meta};
@ -54,6 +55,97 @@ where
} }
} }
/// Turn a string to PascalCase while escaping all special characters in ASCII words.
///
/// (camel_case is used here to match naming of heck.)
///
/// In ActiveEnum, string_value will be PascalCased and made
/// an identifier in {Enum}Variant.
///
/// However Rust only allows for XID_Start char followed by
/// XID_Continue characters as identifiers; this causes a few
/// problems:
///
/// - `string_value = ""` will cause a panic;
/// - `string_value` containing only non-alphanumerics will become `""`
/// and cause the above panic;
/// - `string_values`:
/// - `"A B"`
/// - `"A B"`
/// - `"A_B"`
/// - `"A_ B"`
/// shares the same identifier of `"AB"`;
///
/// This function does the PascelCase conversion with a few special escapes:
/// - Non-Unicode Standard Annex #31 compliant characters will converted to their hex notation;
/// - `"_"` into `"0x5F"`;
/// - `" "` into `"0x20"`;
/// - Empty strings will become special keyword of `"__Empty"`
///
/// Note that this does NOT address:
///
/// - case-sensitivity. String value "ABC" and "abc" remains
/// conflicted after .camel_case().
///
/// Example Conversions:
///
/// ```ignore
/// assert_eq!(camel_case_with_escaped_non_uax31(""), "__Empty");
/// assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x20");
/// assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x200x20");
/// assert_eq!(camel_case_with_escaped_non_uax31("_"), "_0x5F");
/// assert_eq!(camel_case_with_escaped_non_uax31("foobar"), "Foobar");
/// assert_eq!(camel_case_with_escaped_non_uax31("foo bar"), "Foo0x20bar");
/// ```
pub(crate) fn camel_case_with_escaped_non_uax31<T>(string: T) -> String
where
T: ToString,
{
let additional_chars_to_replace: [char; 2] = ['_', ' '];
let mut rebuilt = string
.to_string()
.chars()
.enumerate()
.map(|(pos, char_)| {
if !additional_chars_to_replace.contains(&char_)
&& match pos {
0 => unicode_ident::is_xid_start(char_),
_ => unicode_ident::is_xid_continue(char_),
}
{
char_.to_string()
} else {
format!("{:#X}", char_ as u32)
}
})
.reduce(
// Join the "characters" (now strings)
// back together
|lhs, rhs| lhs + rhs.as_str(),
)
.map_or(
// if string_value is ""
// Make sure the default does NOT go through camel_case,
// as the __ will be removed! The underscores are
// what guarantees this being special case avoiding
// all potential conflicts.
String::from("__Empty"),
|s| s.to_upper_camel_case(),
);
if rebuilt
.chars()
.next()
.map(char::is_numeric)
.unwrap_or(false)
{
rebuilt = String::from("_") + &rebuilt;
}
rebuilt
}
pub(crate) const RAW_IDENTIFIER: &str = "r#"; pub(crate) const RAW_IDENTIFIER: &str = "r#";
pub(crate) const RUST_KEYWORDS: [&str; 49] = [ pub(crate) const RUST_KEYWORDS: [&str; 49] = [
@ -65,3 +157,42 @@ pub(crate) const RUST_KEYWORDS: [&str; 49] = [
]; ];
pub(crate) const RUST_SPECIAL_KEYWORDS: [&str; 3] = ["crate", "Self", "self"]; pub(crate) const RUST_SPECIAL_KEYWORDS: [&str; 3] = ["crate", "Self", "self"];
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_non_uax31_escape() {
// Test empty string
assert_eq!(camel_case_with_escaped_non_uax31(""), "__Empty");
// Test additional_chars_to_replace (to_camel_case related characters)
assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x20");
// Test additional_chars_to_replace (multiples. ensure distinct from single)
assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x200x20");
// Test additional_chars_to_replace (udnerscores)
assert_eq!(camel_case_with_escaped_non_uax31("_"), "_0x5F");
// Test typical use case
assert_eq!(camel_case_with_escaped_non_uax31("foobar"), "Foobar");
// Test spaced words distinct from non-spaced
assert_eq!(camel_case_with_escaped_non_uax31("foo bar"), "Foo0x20bar");
// Test undescored words distinct from non-spaced and spaced
assert_eq!(camel_case_with_escaped_non_uax31("foo_bar"), "Foo0x5Fbar");
// Test leading numeric characters
assert_eq!(camel_case_with_escaped_non_uax31("1"), "_0x31");
// Test escaping also works on full string following lead numeric character
// This was previously a fail condition.
assert_eq!(
camel_case_with_escaped_non_uax31("1 2 3"),
"_0x310x2020x203"
);
}
}

View File

@ -548,6 +548,9 @@ pub fn derive_active_model_behavior(input: TokenStream) -> TokenStream {
/// - For enum variant /// - For enum variant
/// - `string_value` or `num_value`: /// - `string_value` or `num_value`:
/// - For `string_value`, value should be passed as string, i.e. `string_value = "A"` /// - For `string_value`, value should be passed as string, i.e. `string_value = "A"`
/// - Due to the way internal Enums are automatically derived, the following restrictions apply:
/// - members cannot share identical `string_value`, case-insensitive.
/// - in principle, any future Titlecased Rust keywords are not valid `string_value`.
/// - For `num_value`, value should be passed as integer, i.e. `num_value = 1` or `num_value = 1i32` /// - For `num_value`, value should be passed as integer, i.e. `num_value = 1` or `num_value = 1i32`
/// - Note that only one of it can be specified, and all variants of an enum have to annotate with the same `*_value` macro attribute /// - Note that only one of it can be specified, and all variants of an enum have to annotate with the same `*_value` macro attribute
/// ///

View File

@ -408,4 +408,89 @@ mod tests {
test_fallback_uint!(U32Fallback, u32, "u32", "Integer", Integer); test_fallback_uint!(U32Fallback, u32, "u32", "Integer", Integer);
test_fallback_uint!(U64Fallback, u64, "u64", "BigInteger", BigInteger); test_fallback_uint!(U64Fallback, u64, "u64", "BigInteger", BigInteger);
} }
#[test]
fn escaped_non_uax31() {
#[derive(Debug, Clone, PartialEq, Eq, EnumIter, DeriveActiveEnum, Copy)]
#[sea_orm(rs_type = "String", db_type = "Enum", enum_name = "pop_os_names_typos")]
pub enum PopOSTypos {
#[sea_orm(string_value = "Pop!_OS")]
PopOSCorrect,
#[sea_orm(string_value = "Pop\u{2757}_OS")]
PopOSEmoji,
#[sea_orm(string_value = "Pop!_操作系统")]
PopOSChinese,
#[sea_orm(string_value = "PopOS")]
PopOSASCIIOnly,
#[sea_orm(string_value = "Pop OS")]
PopOSASCIIOnlyWithSpace,
#[sea_orm(string_value = "Pop!OS")]
PopOSNoUnderscore,
#[sea_orm(string_value = "Pop_OS")]
PopOSNoExclaimation,
#[sea_orm(string_value = "!PopOS_")]
PopOSAllOverThePlace,
#[sea_orm(string_value = "Pop!_OS22.04LTS")]
PopOSWithVersion,
#[sea_orm(string_value = "22.04LTSPop!_OS")]
PopOSWithVersionPrefix,
#[sea_orm(string_value = "!_")]
PopOSJustTheSymbols,
#[sea_orm(string_value = "")]
Nothing,
// This WILL fail:
// Both PopOs and PopOS will create identifier "Popos"
// #[sea_orm(string_value = "PopOs")]
// PopOSLowerCase,
}
let values = [
"Pop!_OS",
"Pop\u{2757}_OS",
"Pop!_操作系统",
"PopOS",
"Pop OS",
"Pop!OS",
"Pop_OS",
"!PopOS_",
"Pop!_OS22.04LTS",
"22.04LTSPop!_OS",
"!_",
"",
];
for (variant, val) in PopOSTypos::iter().zip(values) {
assert_eq!(variant.to_value(), val);
assert_eq!(PopOSTypos::try_from_value(&val.to_owned()), Ok(variant));
}
#[derive(Clone, Debug, PartialEq, EnumIter, DeriveActiveEnum)]
#[sea_orm(
rs_type = "String",
db_type = "String(None)",
enum_name = "conflicting_string_values"
)]
pub enum ConflictingStringValues {
#[sea_orm(string_value = "")]
Member1,
#[sea_orm(string_value = "$")]
Member2,
#[sea_orm(string_value = "$$")]
Member3,
#[sea_orm(string_value = "AB")]
Member4,
#[sea_orm(string_value = "A_B")]
Member5,
#[sea_orm(string_value = "A$B")]
Member6,
#[sea_orm(string_value = "0 123")]
Member7,
}
type EnumVariant = ConflictingStringValuesVariant;
assert_eq!(EnumVariant::__Empty.to_string(), "");
assert_eq!(EnumVariant::_0x24.to_string(), "$");
assert_eq!(EnumVariant::_0x240x24.to_string(), "$$");
assert_eq!(EnumVariant::Ab.to_string(), "AB");
assert_eq!(EnumVariant::A0x5Fb.to_string(), "A_B");
assert_eq!(EnumVariant::A0x24B.to_string(), "A$B");
assert_eq!(EnumVariant::_0x300x20123.to_string(), "0 123");
}
} }