Daha iyi olduğunu düşündüğüm bir çözüm şöyle:
library(stringdist)
library(gdata)
# Test whether a word is (approximately) an English number word.
#
# string: the word to test; it is lower-cased before comparison.
# dist:   largest string distance that still counts as a match.
# method: distance method handed to stringdist().
#
# Returns TRUE when at least one vocabulary entry lies within `dist` of the
# word, FALSE otherwise.
isNumericWord <- function(string, dist = 1, method = "dl") {
  units <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
             "eight", "nine")
  teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen")
  tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
            "eighty", "ninety")
  scales <- c("hundred", "thousand", "million", "billion", "trillion")
  vocabulary <- c(units, teens, tens, scales)

  candidate <- tolower(string)
  distances <- stringdist(candidate, vocabulary, method = method)
  any(distances <= dist)
}
# Split ordinal number words into a cardinal word followed by its suffix,
# e.g. "first" -> "one st", "twentieth" -> "twenty th", "eleventh" ->
# "eleven th", so that a later step can convert the cardinal part.
#
# string: a single character string. Punctuation is replaced by spaces.
# dist:   largest string distance that still counts as a match.
# method: distance method handed to stringdist().
#
# Returns the words of `string` joined by single spaces ("" when the input
# contains no words). Words that are not ordinals are returned unchanged.
numberTypes <- function(string, dist = 1, method = "dl") {
  nums <- c(
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty",
    "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred",
    "thousand", "million", "billion", "trillion"
  )

  # Ordinals whose stem differs from the cardinal spelling, so stripping
  # "th" is not enough. Rules are applied in this order.
  ordinal <- c(
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    "eighth", "ninth", "tenth", "twelfth", "twentieth", "thirtieth",
    "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth",
    "ninetieth"
  )
  replacement <- c(
    "one st", "two nd", "three rd", "four th", "five th", "six th",
    "seven th", "eight th", "nine th", "ten th", "twelve th", "twenty th",
    "thirty th", "forty th", "fifty th", "sixty th", "seventy th",
    "eighty th", "ninety th"
  )
  # These rules are skipped for words ending in "y": "fifty" and "sixty" are
  # one edit away from "fifth" and "sixth" and must not be read as ordinals.
  guarded <- ordinal %in% c("third", "fourth", "fifth", "sixth", "seventh",
                            "eighth", "ninth")

  string <- gsub("[[:punct:]]", " ", string)
  words <- strsplit(string, split = " ")[[1]]
  words <- words[!is.na(words) & words != ""]
  if (length(words) == 0) {
    return("")
  }

  # Words that have already been split are left alone by every later rule.
  rewritten <- rep(FALSE, length(words))
  for (k in seq_along(ordinal)) {
    lowered <- tolower(words)
    hit <- stringdist(ordinal[k], lowered, method = method) <= dist
    if (guarded[k]) {
      hit <- hit & !endsWith(lowered, "y")
    }
    hit <- which(hit & !rewritten)
    words[hit] <- replacement[k]
    rewritten[hit] <- TRUE
  }

  # Regular ordinals: a number word plus "th" (e.g. "eleventh", "hundredth").
  for (i in which(!rewritten)) {
    word <- words[i]
    len <- nchar(word)
    if (len <= 2) {
      next
    }
    stem <- substr(word, 1, len - 2)
    suffix <- substr(word, len - 1, len)
    if (tolower(suffix) == "th" &&
        any(stringdist(tolower(stem), nums, method = method) <= dist)) {
      words[i] <- paste(stem, suffix)
    }
  }

  # Collapse any run of spaces so the result is always single-spaced.
  gsub(" {2,}", " ", paste(words, collapse = " "))
}
# Convert number words in a sentence to digits,
# e.g. "two thousand three hundred apples" -> "2300 apples".
#
# string: a single character string.
# dist:   largest string distance that still counts as a match (the default
#         of 1 tolerates one typo per word).
# method: distance method handed to stringdist().
#
# Returns `string` unchanged when it contains no number words. Otherwise
# returns the lower-cased words joined by single spaces, after numberTypes()
# has normalised the text, with every run of consecutive number words
# replaced by its digits. Ordinal suffixes produced by numberTypes() are
# glued back on ("twenty first" -> "21st"). Consecutive number words that
# cannot form one number are kept as separate numbers
# ("one two three" -> "1 2 3").
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string

  # Value of every recognised number word, grouped by grammatical role.
  units <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
             six = 6, seven = 7, eight = 8, nine = 9)
  teens <- c(ten = 10, eleven = 11, twelve = 12, thirteen = 13,
             fourteen = 14, fifteen = 15, sixteen = 16, seventeen = 17,
             eighteen = 18, nineteen = 19)
  tens <- c(twenty = 20, thirty = 30, forty = 40, fifty = 50, sixty = 60,
            seventy = 70, eighty = 80, ninety = 90)
  scales <- c(hundred = 100, thousand = 1e3, million = 1e6, billion = 1e9,
              trillion = 1e12)
  vocabulary <- c(units, teens, tens, scales)
  suffixes <- c("st", "nd", "rd", "th")

  # Value of the closest vocabulary word, or NA when none is within `dist`.
  # Picking the closest entry (not the first one within range) keeps
  # "billion" from being read as "million", which is one edit away.
  wordValue <- function(word) {
    d <- stringdist(word, names(vocabulary), method = method)
    # "eighty" is one edit from "eight": a word ending in "y" is never a
    # single digit.
    if (endsWith(word, "y")) {
      d[seq_along(units)] <- Inf
    }
    best <- which.min(d)
    if (length(best) == 0 || d[best] > dist) {
      return(NA_real_)
    }
    unname(vocabulary[best])
  }

  # Fold the values of consecutive number words into one or more numbers.
  # A new number is started whenever the next word cannot grammatically
  # extend the current one (e.g. a digit directly after a digit).
  combineValues <- function(vals) {
    numbers <- numeric(0)
    total <- 0         # completed "<n> thousand/million/..." segments
    current <- 0       # segment still waiting for its scale word
    last_scale <- Inf  # most recent scale word seen (>= 1000)
    prev <- "none"     # kind of the previous word
    for (v in vals) {
      kind <- if (v >= 1000) {
        "scale"
      } else if (v == 100) {
        "hundred"
      } else if (v >= 20) {
        "tens"
      } else if (v >= 10) {
        "teen"
      } else {
        "unit"
      }
      fits <- switch(kind,
        unit = !(prev %in% c("unit", "teen")),
        teen = ,
        tens = prev %in% c("none", "hundred", "scale"),
        hundred = prev != "hundred" && current < 100,
        scale = TRUE
      )
      if (!fits) {
        numbers <- c(numbers, total + current)
        total <- 0
        current <- 0
        last_scale <- Inf
        prev <- "none"
      }
      if (kind == "scale") {
        if (v < last_scale) {
          # Usual descending order: "<current> thousand"; a bare scale
          # word counts as one of itself.
          multiplier <- if (prev %in% c("none", "scale")) 1 else current
          total <- total + multiplier * v
        } else {
          # A larger scale after a smaller one scales everything so far
          # ("one hundred thousand million").
          total <- (total + current) * v
        }
        current <- 0
        last_scale <- v
      } else if (kind == "hundred") {
        current <- if (prev %in% c("none", "scale")) 100 else current * 100
      } else {
        current <- current + v
      }
      prev <- kind
    }
    c(numbers, total + current)
  }

  # Split the string into words
  string <- gsub("-", " ", gsub(" & ", " and ", string, ignore.case = TRUE))
  string <- numberTypes(string, dist = dist, method = method)
  words <- strsplit(tolower(string), " ")[[1]]
  words <- words[words != ""]
  n <- length(words)
  if (n == 0) {
    return(original)
  }

  # Ordinal suffix tokens are never number words themselves.
  candidate <- !(words %in% suffixes)
  is_number <- candidate
  is_number[candidate] <- vapply(words[candidate], isNumericWord, logical(1),
                                 dist = dist, method = method,
                                 USE.NAMES = FALSE)
  values <- rep(NA_real_, n)
  values[is_number] <- vapply(words[is_number], wordValue, numeric(1),
                              USE.NAMES = FALSE)
  # A word only counts as a number if a value was actually found for it.
  is_number <- !is.na(values)
  if (!any(is_number)) {
    return(original)
  }

  # Rebuild the sentence position by position, so only the words of each
  # run are replaced (never equal words elsewhere in the text).
  pieces <- vector("list", n)
  i <- 1
  while (i <= n) {
    if (!is_number[i]) {
      pieces[[i]] <- words[i]
      i <- i + 1
      next
    }
    # Extend j to the last word of the run of number words starting at i.
    j <- i
    while (j < n && is_number[j + 1]) {
      j <- j + 1
    }
    # "%.0f" prints all digits without scientific notation and without
    # touching the global scipen option.
    digits <- sprintf("%.0f", combineValues(values[i:j]))
    # Combine the number with its ending when one directly follows it.
    if (j < n && words[j + 1] %in% suffixes) {
      last <- length(digits)
      digits[last] <- paste0(digits[last], words[j + 1])
      j <- j + 1
    }
    pieces[[i]] <- digits
    i <- j + 1
  }
  trimws(paste(unlist(pieces), collapse = " "))
}
@Henk Sorunuzu biraz düzenledim; tersini değil, kelimeleri sayıya dönüştürmeniz gerektiğini daha açık hale getirmek istedim. –
Yapılması gereken en iyi şey, bir dosyayı gönderen kişiyi kelimeler olarak yazılanlara çekmektir. Tamam, ciddi olmak gerekirse: bunu yapmanın, tüm sayı kelimelerini ('bir', 'iki', ... 'yüz', 'bin', ... 'googol') içeren devasa bir veritabanına ve öncelik için bir tür ağaç sıralayıcıya sahip, oldukça ayrıntılı bir ayrıştırma algoritması yazmaktan başka bir yolu olduğunu sanmıyorum. Örneğin, sizin örneğinizde iki tane 'yüz' var, ancak bunlar kendilerinden sonra gelen kelimelere bağlı olarak farklı anlamlar taşıyor. –