/// Break `s` into word-like runs and single non-word characters.
///
/// A word character is anything for which `char::is_alphanumeric` is
/// true, plus `-` and `_`. Consecutive word characters are returned as
/// one slice; every other character is returned as its own slice.
/// Nothing is dropped, so concatenating the result reproduces `s`.
///
/// `"foo..bar-23"` becomes `["foo", ".", ".", "bar-23"]`.
pub(crate) fn split_words(s: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    // Byte offset where the word currently being scanned began, if any.
    let mut pending: Option<usize> = None;

    for (pos, ch) in s.char_indices() {
        let in_word = ch.is_alphanumeric() || matches!(ch, '-' | '_');
        if in_word {
            if pending.is_none() {
                pending = Some(pos);
            }
            continue;
        }

        // A non-word character ends any word in progress and is then
        // emitted by itself.
        if let Some(begin) = pending.take() {
            pieces.push(&s[begin..pos]);
        }
        // Slice by the UTF-8 width so multi-byte characters stay intact.
        pieces.push(&s[pos..pos + ch.len_utf8()]);
    }

    // The input may end in the middle of a word.
    if let Some(begin) = pending {
        pieces.push(&s[begin..]);
    }
    pieces
}
/// Split `s` into word-like runs and single non-word characters,
/// keeping runs of ASCII digits separate from the letters around them.
///
/// A word character is anything for which `char::is_alphanumeric` is
/// true, plus `_`. A run of word characters is cut whenever the
/// "is an ASCII digit" property flips, so `"a123b"` becomes
/// `["a", "123", "b"]`. Every other character, including `-`, is
/// returned as its own one-character slice. Nothing is dropped, so
/// concatenating the result reproduces `s`.
///
/// `-` is never part of a word here, wherever it appears:
/// `"foo-bar"` becomes `["foo", "-", "bar"]` and `"-foo"` becomes
/// `["-", "foo"]`. (Previously a `-` directly at the start of a word
/// was glued onto it, while a `-` anywhere else was split off.)
pub(crate) fn split_words_and_numbers(s: &str) -> Vec<&str> {
    let mut words = vec![];
    // Byte offset and first character of the run currently being
    // scanned. The first character is enough to know whether the run is
    // numeric, because a run never mixes digits and non-digits.
    let mut word_start: Option<(usize, char)> = None;

    for (idx, c) in s.char_indices() {
        let is_word_char = c.is_alphanumeric() || c == '_';

        match word_start {
            Some((start, start_c)) => {
                if is_word_char && c.is_ascii_digit() == start_c.is_ascii_digit() {
                    // Same kind of character: the current run continues.
                    continue;
                }

                // Either the digit-ness flipped or we hit a non-word
                // character; in both cases the current run is finished.
                words.push(&s[start..idx]);
                if is_word_char {
                    word_start = Some((idx, c));
                } else {
                    // Slice by the UTF-8 width so multi-byte characters
                    // stay intact.
                    words.push(&s[idx..idx + c.len_utf8()]);
                    word_start = None;
                }
            }
            None => {
                if is_word_char {
                    word_start = Some((idx, c));
                } else {
                    words.push(&s[idx..idx + c.len_utf8()]);
                }
            }
        }
    }

    // The input may end in the middle of a run.
    if let Some((start, _)) = word_start {
        words.push(&s[start..]);
    }
    words
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Punctuation between two words becomes its own item.
    #[test]
    fn test_split_words() {
        assert_eq!(split_words("example.com"), vec!["example", ".", "com"]);
    }

    /// Repeated punctuation yields one item per character.
    #[test]
    fn test_split_words_punctuation() {
        assert_eq!(split_words("example.."), vec!["example", ".", "."]);
    }

    /// `split_words` keeps digits inside the surrounding word.
    #[test]
    fn test_split_words_numbers() {
        assert_eq!(split_words("foo123bar"), vec!["foo123bar"]);
    }

    /// A newline is a non-word character like any other.
    #[test]
    fn test_split_words_treats_newline_separately() {
        assert_eq!(
            split_words("example.\ncom"),
            vec!["example", ".", "\n", "com"]
        );
    }

    /// A lone multi-byte alphabetic character is a word.
    #[test]
    fn test_split_words_single_unicode() {
        assert_eq!(split_words("a ö b"), vec!["a", " ", "ö", " ", "b"]);
    }

    /// A lone multi-byte non-alphabetic character is sliced intact.
    #[test]
    fn test_split_words_single_unicode_not_alphabetic() {
        assert_eq!(split_words("a 💝 b"), vec!["a", " ", "💝", " ", "b"]);
    }

    /// Multi-byte alphabetic characters do not break a word.
    #[test]
    fn test_split_words_unicode() {
        assert_eq!(split_words("a xöy b"), vec!["a", " ", "xöy", " ", "b"]);
    }

    /// Digits are separated from the letters on either side.
    #[test]
    fn test_split_words_and_numbers() {
        assert_eq!(split_words_and_numbers("a123b"), vec!["a", "123", "b"]);
    }

    /// Whitespace separates words and is kept as its own item.
    #[test]
    fn test_split_words_and_numbers_spaces() {
        assert_eq!(
            split_words_and_numbers("foo bar"),
            vec!["foo", " ", "bar"]
        );
    }
}