# Convert a short number phrase to an integer.
#
# Recognized forms (case-insensitive, surrounding whitespace ignored):
#   * a spelled-out digit pair such as "5-0" or "5 - 0"   -> 50
#   * otherwise the first run of digits in the text, "42"  -> 42
#   * a number word from "zero" to "nineteen"
#   * a tens word ("twenty" ... "ninety"), optionally followed by a unit
#     word "one" ... "nine", separated by a space or a hyphen
#
# x: a single string (length-zero and NA inputs yield NA).
# Returns a length-one integer, or NA_integer_ when nothing is recognized.
word_to_num <- function(x) {
  stopifnot(length(x) <= 1L)
  if (length(x) == 0L || is.na(x)) {
    return(NA_integer_)
  }
  # Normalize: callers often pass tokens with leftover trailing whitespace,
  # which would otherwise defeat the exact word lookups below.
  x <- trimws(tolower(x))

  # Digit pair such as "5-0". This must be tested before the plain-number
  # branch, which would otherwise grab the leading "5" and return early.
  # Only single digits on each side qualify, so a range like "18-25" still
  # resolves to its first number in the next branch.
  pair_pat <- "\\b\\d\\s*-\\s*\\d\\b"
  pair_hit <- regexpr(pair_pat, x, perl = TRUE)
  if (pair_hit != -1) {
    digits <- as.integer(
      strsplit(regmatches(x, pair_hit), "\\s*-\\s*", perl = TRUE)[[1]]
    )
    return(10L * digits[1] + digits[2])
  }

  # Plain number: take the first standalone run of digits.
  num_hit <- regexpr("\\b\\d+\\b", x, perl = TRUE)
  if (num_hit != -1) {
    return(as.integer(regmatches(x, num_hit)))
  }

  # Word tables (integer-valued so every branch returns the same type).
  small <- c(
    zero = 0L, one = 1L, two = 2L, three = 3L, four = 4L,
    five = 5L, six = 6L, seven = 7L, eight = 8L, nine = 9L,
    ten = 10L, eleven = 11L, twelve = 12L, thirteen = 13L,
    fourteen = 14L, fifteen = 15L, sixteen = 16L, seventeen = 17L,
    eighteen = 18L, nineteen = 19L
  )
  tens <- c(
    twenty = 20L, thirty = 30L, forty = 40L, fifty = 50L,
    sixty = 60L, seventy = 70L, eighty = 80L, ninety = 90L
  )

  # Treat "thirty-five" and "thirty five" alike.
  words <- strsplit(gsub("-", " ", x, fixed = TRUE), "\\s+")[[1]]

  # e.g. "nineteen" or "forty"
  if (length(words) == 1L) {
    if (words %in% names(small)) {
      return(small[[words]])
    }
    if (words %in% names(tens)) {
      return(tens[[words]])
    }
  }

  # e.g. "thirty five"; the second word must be a true unit (1-9), so
  # nonsense such as "twenty eleven" is rejected rather than summed.
  if (
    length(words) == 2L &&
      words[1] %in% names(tens) &&
      words[2] %in% names(small)
  ) {
    unit <- small[[words[2]]]
    if (unit >= 1L && unit <= 9L) {
      return(tens[[words[1]]] + unit)
    }
  }

  NA_integer_
}
# Extract a name candidate from a self-introduction.
#
# Tries a fixed list of introduction phrases in order and returns the
# capitalized word (for some phrases, up to two capitalized words) captured
# by the first phrase that matches. The last pattern is a fallback that takes
# a leading capitalized word followed by whitespace.
#
# s: a single string.
# Returns a character string, or NA_character_ when no pattern matches or
# when `s` is empty or NA.
extract_name <- function(s) {
  # regexpr() yields NA (or nothing) for NA / zero-length input, which would
  # make the `if` below fail; treat those inputs as "no name found".
  if (length(s) == 0L || is.na(s[1])) {
    return(NA_character_)
  }

  # Patterns that introduce a name; order matters, most specific first.
  pats <- c(
    "I go by\\s+([A-Z][a-z]+)",
    "I'm\\s+([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)?)",
    "They call me\\s+([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)?)",
    "^([A-Z][a-z]+) here",
    "The name's\\s+([A-Z][a-z]+)",
    "^([A-Z][a-z]+)\\s" # fallback: leading capital word
  )

  for (p in pats) {
    hit <- regexpr(p, s, perl = TRUE)
    if (hit[1] != -1) {
      # Re-apply the pattern to the matched text to keep only the captured
      # name. Use the same (PCRE) engine as the search, since the patterns
      # contain Perl-style non-capturing groups.
      return(sub(p, "\\1", regmatches(s, hit), perl = TRUE))
    }
  }

  NA_character_
}
# Extract an age phrase from a sentence and convert it to a number.
#
# Three patterns are tried in order; the first one whose token converts to a
# number (via word_to_num) wins:
#   1. a number, a hyphenated digit pair, or a word starting with "N"/"n",
#      followed by one of the keywords years/year/birthday/young/this
#   2. a capitalized word followed by "years"
#   3. a hyphenated digit pair introduced by "big" (e.g. "big 5-0")
#
# s: a single string.
# Returns a length-one integer, or NA_integer_ when no pattern yields a
# number or when `s` is empty or NA.
extract_age <- function(s) {
  # regexpr() yields NA (or nothing) for NA / zero-length input, which would
  # make the `if` conditions below fail; treat those as "no age found".
  if (length(s) == 0L || is.na(s[1])) {
    return(NA_integer_)
  }

  # Strip surrounding whitespace before converting: after the keyword is
  # removed the token still carries the separating space ("Nineteen "),
  # which would otherwise make the word lookup miss.
  to_age <- function(token) {
    as.integer(word_to_num(trimws(token)))
  }

  # 1. Number-like token followed by an age keyword.
  m <- regexpr(
    "(\\b\\d+\\b|\\b\\d+\\s*-\\s*\\d+\\b|\\b[Nn][a-z-]+\\b)\\s*(years|year|birthday|young|this)",
    s,
    perl = TRUE
  )
  if (m[1] != -1) {
    token <- sub(
      "(years|year|birthday|young|this)$",
      "",
      regmatches(s, m)
    )
    age <- to_age(token)
    # The "N"-word alternative also matches non-numbers ("next year");
    # in that case keep looking instead of giving up.
    if (!is.na(age)) {
      return(age)
    }
  }

  # 2. Capitalized number word directly before "years" (e.g. "Thirty years").
  m2 <- regexpr("\\b([A-Z][a-z]+)\\b\\s+years", s, perl = TRUE)
  if (m2[1] != -1) {
    token <- sub("\\s+years.*", "", regmatches(s, m2))
    age <- to_age(token)
    if (!is.na(age)) {
      return(age)
    }
  }

  # 3. Hyphenated "big 5-0".
  m3 <- regexpr("big\\s+(\\d+\\s*-\\s*\\d+)", s, perl = TRUE)
  if (m3[1] != -1) {
    token <- sub("big\\s+", "", regmatches(s, m3))
    age <- to_age(token)
    if (!is.na(age)) {
      return(age)
    }
  }

  NA_integer_
}