实验背景
“图灵测试”是指测试者在与被测试者(一个人和一台机器)隔开的情况下,通过一些装置(如键盘)向被测试者随意提问。进行多次测试后,如果有超过30%的测试者不能确定出被测试者是人还是机器,那么这台机器就通过了测试,并被认为具有人类智能。
本实验设计了一个算法,使得机器能够通过图灵测试。即通过对所提供的文本进行分析,根据每个词之后的一定词数范围内的词出现的频率决定机器所给出的一句话之中下一个词是什么。本实验中提供了莎士比亚的作品,以达到机器能够“写诗”的目的。
1.SEQUENCE_UTIL
1.1String Tokens
tokens : (char -> bool) -> string -> string seq
函数功能:
返回所给字符串当中的不含给定字符的全部最大非空子串
函数代码:
val cp=not o Char.isAlphaNum
fun tokens (cp : char -> bool) (str : string) : string seq = let val n = String.size str val chars = tabulate (fn i => (i, String.sub (str, i))) n val idx = map (fn (i,_) => i) (filter (fn (_,c) => cp c) chars) (* grab substrings in between delimiters *) val subs = map2 (fn (i,i') => String.substring (str, i, i' - i)) (append (singleton 0, map (fn i => i + 1) idx)) (append (idx, singleton n)) in filter (fn s => size s > 0) subs end1.2Histograms
1.2.1 histograms
type 'a hist = ('a * int) seq
val histogram : 'a ord -> 'a seq -> 'a hist
函数功能:
返回一个(key,num)二元组构成的串,其中num为key在所给seq中出现的频次
函数代码:
fun histogram (cmp : 'a ord) (s : 'a seq) : 'a hist = map (fn (a, c) => (a, length c)) (collect cmp (map (fn a => (a, ())) s))1.2.2 Choose
val choose : 'a hist -> real -> 'a
函数功能:
对于给定的频率p,函数返回hist中出现频率等于p或者大于且最接近p的元素
实现思路:
先通过PerCentage函数将每个元素出现的频次转化为累加频率,其后通过二分查找法搜索即可,例如:
h = 〈(“a”,3),(“b”,5),(“c”,2)〉. “a” 0.3 “b” 0.8 “c” 1.0
函数代码:
fun PerCentage hist = let val Sum = #2 (nth (scani (fn ((l,a),(r,b))=> (r,a+b) ) (#1 (nth hist 0),0) hist) (length hist - 1)) in map (fn (n,v) => (n,Real.fromInt v / Real.fromInt Sum)) (scani (fn ((l,a),(r,b)) => (r,a+b) ) (#1 (nth hist 0),0) hist) end fun choose (hist : 'a hist) (p : real) : 'a = let val Hist = PerCentage hist in case (length hist) of 1 => #1 (nth hist 0) | n => let val mid = n div 2 val (PDis,Dis) = (#2 (nth Hist (mid-1)),#2 (nth Hist mid)) in if (p>PDis andalso p<= Dis) then #1 (nth hist (mid)) else if p>Dis then choose (subseq hist (mid,n-mid)) p else choose (subseq hist (0,mid)) p end end复杂度分析:
PerCentage
Sum:W=O(n) S=O(logn)
res:W=O(n) S=O(logn)
故W(PerCentage)=O(n) S(PerCentage)=O(logn)
Choose
由于采用二分查找,W(Choose)=O(n) S(Choose)=O(logn)
2.K-Gram Stats
2.1 kgramstats type
type kgramstats = ((token * int) seq T.table) * int
2.2 makeStats
val makeStats : string -> int -> kgramstats
函数功能:
makeStats corpus maxK
返回一个kgramstats类型
其中
table的key为输入的corpus的全部的长度小于maxK的非空子字符串(string)
value为在corpus中key之后的第一个单词以及其频率组成的二元组所构成的串
函数思路:
先找出全部子串以及紧接着的单词,在整合成一个table即可
函数代码:
fun cp c = case (Char.isAlphaNum c ) of true => false | false => true (* You must define the abstract kgramstats type *) (**) type kgramstats = ((token * int) seq T.table) * int fun tab (Seq:string seq) (x:int) = tabulate (fn i => if i=(length Seq - x) then (subseq Seq (i,x),("",1)) else (subseq Seq (i,x),(nth Seq (i+x),1))) (length Seq - x -1) fun gen Seq maxK = case maxK of ~1 => empty () | _ => append (tab Seq maxK,gen Seq (maxK-1)) fun makeStats (corpus : string) (maxK : int) : kgramstats = let val FirSeq = tokens cp corpus val PT1 = gen FirSeq maxK val PT2 = map (fn (str,seq) => (str,map (fn (l,r) => (l,length r)) (collect String.compare seq))) (collect (collate String.compare) PT1) in (Table.fromSeq PT2,maxK) end复杂度分析:
前提:假设maxK很小
tab:
W=O(n) S=O(1)
gen:
W=maxK*W(tab)=O(n) S=maxK*S(tab)=O(1)
makeStats:
W(collect (collate String.compare) PT1)=O(nlogn)
S(collect (collate String.compare) PT1)=O(log^2n)
W(map (fn (str,seq) => (str,map (fn (l,r) => (l,length r)) (collect String.compare seq))) (...))=O(nlogn)
S(map (fn (str,seq) => (str,map (fn (l,r) => (l,length r)) (collect String.compare seq))) (...))=O(log^2n)
W(Table.fromSeq)=O(nlogn)
S(Table.fromSeq)=O(log^2n)
故
W(makeStats)=O(nlogn)
S(makeStats)=O(log^2n)
2.3 lookupExts
val lookupExts : kgramstats -> kgram -> token hist
fun lookupExts (stats : kgramstats) (kgram : kgram) : (token * int) seq = case (Table.find (#1 stats) kgram) of NONE => empty () | SOME x =>x2.4 maxK
val maxK : kgramstats -> int
fun maxK (stats : kgramstats) : int = #2 stats
3 Babble
3.1 randomSentence
val randomSentence : kgramstats -> int -> Rand.rand -> string
fun randomSentence (stats : KS.kgramstats) (n : int) (seed : R.rand) = let val max = KS.maxK stats val Res = empty () val RealRand = Rand.randomRealSeq seed (SOME (0.0,1.0)) n val Pre = KS.Seq.singleton (Util.choose (KS.lookupExts stats Res) (KS.Seq.nth RealRand (n-1))) val _ = print ("This time the chosed word is : " ^ (String.concatWith " " (KS.Seq.toList Pre)) ^ "\n") fun nextHist stats x max = let val len = KS.Seq.length x val next = if len < max orelse len = max then x else KS.Seq.drop (x,len - max) val pre = KS.lookupExts stats next val res = case length pre of 0 => nextHist stats (drop (next,1)) max | _ => pre in res end fun f (x,r,a,n,stats,max) = case a of 0 => KS.Seq.empty() | _ => let val pre = nextHist stats x max val Word = Util.choose pre (KS.Seq.nth r a) val x' = KS.Seq.append (x,KS.Seq.singleton Word) in KS.Seq.append (KS.Seq.singleton Word,f (x',r,a-1,n,stats,max)) end in String.concat [(String.concatWith " " (KS.Seq.toList (append (Pre,(f (Pre,RealRand,n-1,n,stats,max)))))),"."] end3.2 randomDocument
val randomDocument : kgramstats -> int -> Rand.rand -> string
fun randomDocument (stats : KS.kgramstats) (n : int) (seed : R.rand) = let val IntRand = Rand.randomIntSeq seed (SOME (5,10)) n val SeedSeq = map Rand.fromInt (Rand.randomIntSeq seed (SOME (0,10000)) n) val pre = map2 (fn (x,s) => randomSentence stats x s) IntRand SeedSeq in "The result is : \n" ^ (String.concatWith " " (KS.Seq.toList pre)) end
