寻找中文自描述句子

来源：互联网　发布：mac如何移动文件夹　编辑：程序博客网　时间：2024/05/17 07:55

在字母语言里，自描述句子的例子（见 http://hi.baidu.com/mynana/blog/item/bfb3aeafa707a1cc7dd92a04.html）：
       "Only the fool would take trouble to verify that this sentence was composed of ten a's, three b's, four c's, four d's, forty-six e's, sixteen f's, four g's, thirteen h's, fifteen i's, two k's, nine l's, four m's, twenty-five n's, twenty-four o's, five p's, sixteen r's, forty-one s's, thirty-seven t's, ten u's, eight v's, eight w's, four x's, eleven y's, twenty-seven commas, twenty-three apostrophes, seven hyphens and, last but not least, a single !"
    句子的主体说明了字母的个数。
    我想到的是要找到‘中文’自描述句子，因为没有字母，所以用笔画取代,形如:
        "本个句子有一十二个一划字一十三个二划字一十九个三划字一十个四划字一十七个五划字七个六划字"
    注意这个句子只是样例，真正是否存在还是未知的.
    采用了一种搜索方法:从一个状态开始，对任意一个笔划作校正，如果碰到重复的句子就让全部笔划随机跳到另外一个状态.
    这是昨天的思路，有两个有意思的运行现象:
    1) 因为采用了python的dict做hash，key是中文数字（unicode），比较消耗内存，ubuntu已经用到了90％的内存＋1个G的swap，到后来cpu什么都不干了，光做磁盘swap来回查hash；后来还是死机了...
    2) 随机算法到最后会有非常少的遗漏状态空间，来回跳，如果要遍历完会花很长时间

    针对第一个做了调整，优化了hash存储，用阿拉伯数字作key；对第二个就是在剩余空间小于一个值后，直接上蛮力搜索.
    现在能够搜索完整了，但是又发现第三个现象:
    3) 不是每个句子都可能构造出的
    比如上面那句就不行，但是这个句子就可以，尽管不太通顺，因为有两个‘个’字:
       "本个个句子有二个一划字六个二划字一十个三划字三个四划字九个五划字七个六划字"
    没关系，只是实验而已，剩下的任务就是构造通顺的句子了，基本上就是随便加一些或者改一些字就好。第一个中文自描述句子诞生了！
    ==================================================
   "这个句子有二个一划字九个二划字九个三划字二个四划字九个五划字八个六划字一个七划字"
    ===================================================

代码如下:

#-*- coding: UTF-8 -*-
import itertools
import operator
import random
def get_cnt( s, i ):
    """Count the characters of ``s`` whose stroke count equals ``i``.

    Stroke counts are looked up in the module-level ``word_cnt`` table; a
    character that is missing from the table raises ``KeyError``.
    """
    return sum( 1 for ch in s if word_cnt[ ch ] == i )
def stat( s ):
    """Return a 7-slot histogram of the stroke counts found in ``s``.

    Slot ``k`` holds the number of characters ``c`` in ``s`` for which
    ``word_cnt[c] == k + 1``. Characters missing from the module-level
    ``word_cnt`` table raise ``KeyError``.
    """
    histogram = [ 0 for _ in range( 7 ) ]
    for ch in s:
        slot = word_cnt[ ch ] - 1
        histogram[ slot ] = histogram[ slot ] + 1
    return histogram
def convert_num_to_chinese( i ):
    """Render ``i`` (assumed to be below 100) with Chinese digit characters.

    A non-zero tens digit is always spelled out and followed by the
    character for ten, so 10 becomes "one-ten" and 21 becomes
    "two-ten-one". A zero digit contributes nothing, which means that 0
    is rendered as the empty string.
    """
    tens, units = divmod( i, 10 )
    pieces = []
    if tens != 0:
        pieces.append( chinese_nums[ tens ] )
        pieces.append( u"十" )
    if units != 0:
        pieces.append( chinese_nums[ units ] )
    return "".join( pieces )
def convert_chines_to_num( s ):
    """Parse a Chinese numeral string (value assumed below 100) into an int.

    Accepted forms:
      * the empty string, which convert_num_to_chinese emits for 0;
      * a single character looked up in ``nums_chinese`` (one..ten);
      * ``[tens-digit] + ten-character + [units-digit]``, where a missing
        tens digit means one ten and a missing units digit means zero.

    Raises ``KeyError`` for characters missing from ``nums_chinese`` and
    ``ValueError`` when a multi-character string does not contain exactly
    one ten-character.
    """
    if not s:
        # Inverse of convert_num_to_chinese( 0 ) == "".
        return 0
    if len( s ) == 1:
        return nums_chinese[ s ]
    tens, units = s.split( u'十' )
    if tens:
        value = nums_chinese[ tens ] * 10
    else:
        # A leading bare ten-character stands for exactly one ten; it was
        # previously counted as zero tens, turning eleven into one.
        value = 10
    if units:
        value += nums_chinese[ units ]
    return value
def combine_sentence():
    """Build the candidate sentence from the current module-level state.

    Each entry of ``chinese_cnts`` is glued to the entry of ``postfixes``
    at the same position, and the resulting clauses are appended to
    ``prefix``.
    """
    clauses = map( operator.add, chinese_cnts, postfixes )
    body = "".join( clauses )
    return prefix + body
    
# Digit value -> numeral character (1..9).
chinese_nums = dict( enumerate( u'一二三四五六七八九', 1 ) )
# Numeral character -> value (1..10).
nums_chinese = dict( ( ch, n ) for n, ch in enumerate( u'一二三四五六七八九十', 1 ) )
# Stroke count assumed for every character that can occur in a sentence.
word_cnt = {
    u'这': 7,
    u'本': 5,
    u'个': 3,
    u'句': 5,
    u'子': 2,  # NOTE(review): usually counted as 3 strokes - confirm
    u'有': 6,
    u'划': 6,
    u'字': 5,  # NOTE(review): usually counted as 6 strokes - confirm
    u'一': 1,
    u'二': 2,
    u'三': 3,
    u'四': 5,
    u'五': 4,
    u'六': 4,
    u'七': 2,
    u'八': 2,
    u'九': 2,
    u'十': 2,
}
# Fixed text of the sentence; one count is inserted in front of each postfix.
prefix = u'这个句子有'
postfixes = [
    u'个一划字',
    u'个二划字',
    u'个三划字',
    u'个四划字',
    u'个五划字',
    u'个六划字',
    u'个七划字',
]
# Stroke histogram of the fixed text alone, and its numeral rendering, which
# is the starting state of the search.
init_cnts = stat( "".join( [ prefix ] + postfixes ) )
chinese_cnts = list( map( convert_num_to_chinese, init_cnts ) )
# Sanity output: the starting counts, then spot checks of the numeral parser.
print( ",".join( chinese_cnts ) )
print( convert_chines_to_num( u'二十一' ) )
print( convert_chines_to_num( u'十一' ) )
print( convert_chines_to_num( u'二十' ) )
print( convert_chines_to_num( u'二' ) )
# Disabled debugging aids, kept for reference:
#print stat( u'本个个句子有二个一划字六个二划字一十个三划字三个四划字九个五划字七个六划字' ) 
#s = u'这个句子有二个一划字九个二划字九个三划字二个四划字九个五划字八个六划字一个七划字' 
#print stat( s )
#print "".join( sorted( s, cmp = lambda x,y: cmp( word_cnt[x], word_cnt[y]) ) )
#import sys; sys.exit()
# Random-walk search for a sentence whose stated counts equal its real counts.
#
# State: ``chinese_cnts``, the seven counts currently written in the sentence.
# Only the first five are searched; each ranges over
# init_cnts[k] + 0 .. init_cnts[k] + MaxRange - 1.
MaxRange = 12
# Give up the random walk once this many distinct states have been visited
# (99% of the MaxRange ** 5 grid).
EndSize = MaxRange ** 5 * 99 // 100
found = 0
# Visited states, keyed by the counts written as space-separated decimal
# numbers. Kept as a dict because the code after this loop queries it too.
history = {}
while not found:
    sentence = combine_sentence()
    print( sentence )
    print( '-' * 50 )
    print( len( history ) )
    key = " ".join( str( convert_chines_to_num( c ) ) for c in chinese_cnts )
    if key in history:
        # Already visited: jump to a random point of the grid. randrange
        # excludes MaxRange, so the jump stays inside the MaxRange ** 5 grid
        # that EndSize and the exhaustive pass assume; randint( 0, MaxRange )
        # also produced the offset MaxRange itself.
        offsets = [ random.randrange( MaxRange ) for i in range( 5 ) ]
        cnts = list( map( operator.add, offsets, init_cnts[:5] ) )
        chinese_cnts = list( map( convert_num_to_chinese, cnts ) ) + chinese_cnts[5:]
        print( 'same, jump to  %s' % ( cnts, ) )
        continue
    history[ key ] = 1
    cnts = stat( sentence )
    real_cnts = list( map( convert_num_to_chinese, cnts ) )
    print( ",".join( real_cnts ) )
    print( ",".join( chinese_cnts ) )

    if chinese_cnts == real_cnts:
        found = 1
        print( 'found' )
    else:
        # Correct one randomly chosen stated count to its real value. Only
        # slots 0..4 are touched: no numeral character in word_cnt has six
        # or seven strokes, so those two counts never change.
        i = random.randrange( 5 )
        # stat() already counted every stroke class, so real_cnts[ i ] is
        # the same value a separate get_cnt( sentence, i + 1 ) would give.
        chinese_cnts[ i ] = real_cnts[ i ]
    if len( history ) > EndSize:
        break
# Exhaustive fallback: walk the whole MaxRange ** 5 grid of offsets in
# lexicographic order instead of choosing states at random, skipping every
# state the random walk has already recorded in ``history``.
# NOTE(review): the scan does not stop at the first hit; every unvisited
# candidate sentence is printed, with 'found' printed just before a matching
# one - confirm that this is intended.
if not found:
    for offsets in itertools.product( range( MaxRange ), repeat = 5 ):
        cnts = list( map( operator.add, offsets, init_cnts[:5] ) )
        # The last two counts (six- and seven-stroke characters) stay fixed.
        chinese_cnts = list( map( convert_num_to_chinese, cnts ) ) + chinese_cnts[5:]
        key = " ".join( str( convert_chines_to_num( c ) ) for c in chinese_cnts )
        if key not in history:
            sentence = combine_sentence()
            cnts = stat( sentence )
            real_cnts = list( map( convert_num_to_chinese, cnts ) )
            if chinese_cnts == real_cnts:
                found = 1
                print( 'found' )
            print( sentence )
if not found:
    print( 'impossible' )