Gensim源代码详解——utils（持续更新中）

来源：互联网发布：wifi电话软件下载编辑：程序博客网时间：2024/06/04 17:45

#!/usr/bin/env python# -*- coding: utf-8 -*-## Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz># Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html"""这个模块包含各种通用实用函数。"""from __future__ import with_statementimport loggingimport warningslogger = logging.getLogger(__name__)try:    from html.entities import name2codepoint as n2cpexcept ImportError:    from htmlentitydefs import name2codepoint as n2cptry:    import cPickle as _pickleexcept ImportError:    import pickle as _pickleimport reimport unicodedataimport osimport randomimport itertoolsimport tempfilefrom functools import wraps  # for `synchronous` function lockimport multiprocessingimport shutilimport sysfrom contextlib import contextmanagerimport subprocessimport numpy as npimport numbersimport scipy.sparseif sys.version_info[0] >= 3:    unicode = strfrom six import iterkeys, iteritems, u, string_types, unichrfrom six.moves import xrangetry:    from smart_open import smart_openexcept ImportError:    logger.info("smart_open library not found; falling back to local-filesystem-only")    def make_closing(base, **attrs):        """        Add support for `with Base(attrs) as fout:` to the base class if it's missing.        The base class' `close()` method will be called on context exit, to always close the file properly.        This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise        raise "AttributeError: GzipFile instance has no attribute '__exit__'".        """        if not hasattr(base, '__enter__'):            attrs['__enter__'] = lambda self: self        if not hasattr(base, '__exit__'):            attrs['__exit__'] = lambda self, type, value, traceback: self.close()        return type('Closing' + base.__name__, (base, object), attrs)    def smart_open(fname, mode='rb'):        _, ext = os.path.splitext(fname)        if ext == '.bz2':            from bz2 import BZ2File            return make_closing(BZ2File)(fname, mode)        if ext == '.gz':            from gzip import GzipFile            return make_closing(GzipFile)(fname, mode)        return open(fname, mode)PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)def get_random_state(seed):    """    Turn seed into a np.random.RandomState instance.    Method originally from maciejkula/glove-python, and written by @joshloyal.    """    if seed is None or seed is np.random:        return np.random.mtrand._rand    if isinstance(seed, (numbers.Integral, np.integer)):        return np.random.RandomState(seed)    if isinstance(seed, np.random.RandomState):        return seed    raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)def synchronous(tlockname):    """    A decorator to place an instance-based lock around a method.    Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/    """    def _synched(func):        @wraps(func)        def _synchronizer(self, *args, **kwargs):            tlock = getattr(self, tlockname)            logger.debug("acquiring lock %r for %s" % (tlockname, func.__name__))            with tlock:  # use lock as a context manager to perform safe acquire/release pairs                logger.debug("acquired lock %r for %s" % (tlockname, func.__name__))                result = func(self, *args, **kwargs)                logger.debug("releasing lock %r for %s" % (tlockname, func.__name__))                return result        return _synchronizer    return _synchedclass NoCM(object):    def acquire(self):        pass    def release(self):        pass    def __enter__(self):        pass    def __exit__(self, type, value, traceback):        passnocm = NoCM()@contextmanagerdef file_or_filename(input):    """    返回一个类似文件的对象，准备从一开始就读取。“输入”    文件名(gz/bz2也支持)或类似于文件的对象支持查找。    """    if isinstance(input, string_types):        # input was a filename: open as file        yield smart_open(input)    else:        # input already a file-like object; just reset to the beginning        input.seek(0)        yield input#字符转换def deaccent(text):    """    从给定的字符串中删除强调。输入文本要么是unicode字符串，要么是utf8编码的bytestring。    返回带有口音的输入字符串，如unicode。    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")    u'Sef chomutovskych komunistu dostal postou bily prasek'    """    if not isinstance(text, unicode):        # assume utf8 for byte strings, use default (strict) error handling        text = text.decode('utf8')    norm = unicodedata.normalize("NFD", text)    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')    return unicodedata.normalize("NFC", result)def copytree_hardlink(source, dest):    """    递归地复制一个目录，这是一个目录。copytree,但hardlink文件而不是复制。仅在UNIX系统上可用。    """    copy2 = shutil.copy2    try:        shutil.copy2 = os.link        shutil.copytree(source, dest)    finally:        shutil.copy2 = copy2def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False,             lower=False):    """    迭代地将标记作为unicode字符串，去掉重音符号        通过赋值，可以选择将unidoce字符串设置为小写        对于其中一个参数，小写的，低的，或低的。        输入文本可以是unicode的，也可以是utf-8编码的字节字符串。        输出的标记是字母的最大连续序列。        字符(没有数字!)。    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']    """    lowercase = lowercase or to_lower or lower    text = to_unicode(text, encoding, errors=errors)    if lowercase:        text = text.lower()    if deacc:        text = deaccent(text)    return simple_tokenize(text)def simple_tokenize(text):    for match in PAT_ALPHABETIC.finditer(text):        yield match.group()def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):    """    将文档转换为标记列表。    这个小写的例子，标记化，去重音(可选的)。输出是最后的结果    令牌=unicode字符串，它不会再被处理。    """    tokens = [        token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore')        if min_len <= len(token) <= max_len and not token.startswith('_')    ]    return tokensdef any2utf8(text, errors='strict', encoding='utf8'):    """将字符串(编码中的unicode或bytestring)转换为utf8中的bytestring。"""    if isinstance(text, unicode):        return text.encode('utf8')    # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8    return unicode(text, encoding, errors=errors).encode('utf8')to_utf8 = any2utf8def any2unicode(text, encoding='utf8', errors='strict'):    """将字符串(由编码或unicode编码的bytestring)转换为unicode。"""    if isinstance(text, unicode):        return text    return unicode(text, encoding, errors=errors)to_unicode = any2unicodedef call_on_class_only(*args, **kwargs):    """Raise exception when load methods are called on instance"""    raise AttributeError('This method should be called on a class object.')class SaveLoad(object):    """    从这个类继承的对象具有保存/加载函数，这是一个解/pickle的函数    他们到磁盘。    这将使用pickle来进行/序列化，因此对象不能包含    不可picklable属性，例如lambda函数等。    """    @classmethod    def load(cls, fname, mmap=None):        """        从文件中加载一个先前保存的对象(也可以看到保存)。        如果使用大型数组存储了对象，则可以加载该对象        通过mmap(共享内存)使用mmap='r'来实现这些数组。默认值:不使用        mmap，作为普通对象加载大数组。        如果正在加载的文件被压缩了(或者“。”广州’或‘bz2”)        mmap=没有设置。如果这种情况下，负载将会增加一个IOError。        遇到。        """        logger.info("loading %s object from %s" % (cls.__name__, fname))        compress, subname = SaveLoad._adapt_by_suffix(fname)        obj = unpickle(fname)        obj._load_specials(fname, mmap, compress, subname)        logger.info("loaded %s", fname)        return obj    def _load_specials(self, fname, mmap, compress, subname):        """        加载特殊存储的属性，并提供相同的属性        机会递归地包含了SaveLoad实例。        """        mmap_error = lambda x, y: IOError(            'Cannot mmap compressed object %s in file %s. ' % (x, y) +            'Use `load(fname, mmap=None)` or uncompress files manually.')        for attrib in getattr(self, '__recursive_saveloads', []):            cfname = '.'.join((fname, attrib))            logger.info("loading %s recursively from %s.* with mmap=%s" % (                attrib, cfname, mmap))            getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)        for attrib in getattr(self, '__numpys', []):            logger.info("loading %s from %s with mmap=%s" % (                attrib, subname(fname, attrib), mmap))            if compress:                if mmap:                    raise mmap_error(attrib, subname(fname, attrib))                val = np.load(subname(fname, attrib))['val']            else:                val = np.load(subname(fname, attrib), mmap_mode=mmap)            setattr(self, attrib, val)        for attrib in getattr(self, '__scipys', []):            logger.info("loading %s from %s with mmap=%s" % (                attrib, subname(fname, attrib), mmap))            sparse = unpickle(subname(fname, attrib))            if compress:                if mmap:                    raise mmap_error(attrib, subname(fname, attrib))                with np.load(subname(fname, attrib, 'sparse')) as f:                    sparse.data = f['data']                    sparse.indptr = f['indptr']                    sparse.indices = f['indices']            else:                sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)                sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)                sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)            setattr(self, attrib, sparse)        for attrib in getattr(self, '__ignoreds', []):            logger.info("setting ignored attribute %s to None" % (attrib))            setattr(self, attrib, None)    @staticmethod    def _adapt_by_suffix(fname):        """Give appropriate compress setting and filename formula"""        if fname.endswith('.gz') or fname.endswith('.bz2'):            compress = True            subname = lambda *args: '.'.join(list(args) + ['npz'])        else:            compress = False            subname = lambda *args: '.'.join(list(args) + ['npy'])        return (compress, subname)    def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2,                    ignore=frozenset(), pickle_protocol=2):        """        将对象保存到文件中(也可以看到load)。        如果分别是None，则自动检测到大        numpy / scipy。存储和存储对象中的稀疏数组        成单独的文件。这避免了pickle内存错误        允许mmap在负载上有效地返回大型阵列。        你也可以手动设置，在这种情况下，它必须是        将属性名存储在单独的文件中。那        在本例中没有执行自动检查。        忽略是一组不序列化的属性名(文件        处理、缓存等)。在随后的load()这些属性中        被设置为None。        pickleprotocol默认为2，因此可以导入pickle对象        在python2和3中。        """        logger.info(            "saving %s object under %s, separately %s" % (                self.__class__.__name__, fname, separately))        compress, subname = SaveLoad._adapt_by_suffix(fname)        restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol,                                       compress, subname)        try:            pickle(self, fname, protocol=pickle_protocol)        finally:            # restore attribs handled specially            for obj, asides in restores:                for attrib, val in iteritems(asides):                    setattr(obj, attrib, val)        logger.info("saved %s", fname)    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):        """        Save aside any attributes that need to be handled separately, including        by recursion any attributes that are themselves SaveLoad instances.        Returns a list of (obj, {attrib: value, ...}) settings that the caller        should use to restore each object's attributes that were set aside        during the default pickle().        """        asides = {}        sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)        if separately is None:            separately = []            for attrib, val in iteritems(self.__dict__):                if isinstance(val, np.ndarray) and val.size >= sep_limit:                    separately.append(attrib)                elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:                    separately.append(attrib)        # whatever's in `separately` or `ignore` at this point won't get pickled        for attrib in separately + list(ignore):            if hasattr(self, attrib):                asides[attrib] = getattr(self, attrib)                delattr(self, attrib)        recursive_saveloads = []        restores = []        for attrib, val in iteritems(self.__dict__):            if hasattr(val, '_save_specials'):  # better than 'isinstance(val, SaveLoad)' if IPython reloading                recursive_saveloads.append(attrib)                cfname = '.'.join((fname, attrib))                restores.extend(val._save_specials(                    cfname, None, sep_limit, ignore,                    pickle_protocol, compress, subname))        try:            numpys, scipys, ignoreds = [], [], []            for attrib, val in iteritems(asides):                if isinstance(val, np.ndarray) and attrib not in ignore:                    numpys.append(attrib)                    logger.info("storing np array '%s' to %s" % (                        attrib, subname(fname, attrib)))                    if compress:                        np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))                    else:                        np.save(subname(fname, attrib), np.ascontiguousarray(val))                elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:                    scipys.append(attrib)                    logger.info("storing scipy.sparse array '%s' under %s" % (                        attrib, subname(fname, attrib)))                    if compress:                        np.savez_compressed(                            subname(fname, attrib, 'sparse'),                            data=val.data,                            indptr=val.indptr,                            indices=val.indices)                    else:                        np.save(subname(fname, attrib, 'data'), val.data)                        np.save(subname(fname, attrib, 'indptr'), val.indptr)                        np.save(subname(fname, attrib, 'indices'), val.indices)                    data, indptr, indices = val.data, val.indptr, val.indices                    val.data, val.indptr, val.indices = None, None, None                    try:                        # store array-less object                        pickle(val, subname(fname, attrib), protocol=pickle_protocol)                    finally:                        val.data, val.indptr, val.indices = data, indptr, indices                else:                    logger.info("not storing attribute %s" % (attrib))                    ignoreds.append(attrib)            self.__dict__['__numpys'] = numpys            self.__dict__['__scipys'] = scipys            self.__dict__['__ignoreds'] = ignoreds            self.__dict__['__recursive_saveloads'] = recursive_saveloads        except:            # restore the attributes if exception-interrupted            for attrib, val in iteritems(asides):                setattr(self, attrib, val)            raise        return restores + [(self, asides)]    def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2,             ignore=frozenset(), pickle_protocol=2):        """        Save the object to file (also see `load`).        `fname_or_handle` is either a string specifying the file name to        save to, or an open file-like object which can be written to. If        the object is a file handle, no special array handling will be        performed; all attributes will be saved to the same file.        If `separately` is None, automatically detect large        numpy/scipy.sparse arrays in the object being stored, and store        them into separate files. This avoids pickle memory errors and        allows mmap'ing large arrays back on load efficiently.        You can also set `separately` manually, in which case it must be        a list of attribute names to be stored in separate files. The        automatic check is not performed in this case.        `ignore` is a set of attribute names to *not* serialize (file        handles, caches etc). On subsequent load() these attributes will        be set to None.        `pickle_protocol` defaults to 2 so the pickled object can be imported        in both Python 2 and 3.        """        try:            _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)            logger.info("saved %s object" % self.__class__.__name__)        except TypeError:  # `fname_or_handle` does not have write attribute            self._smart_save(fname_or_handle, separately, sep_limit, ignore,                             pickle_protocol=pickle_protocol)#endclass SaveLoaddef identity(p):    """Identity fnc, for flows that don't accept lambda (pickling etc)."""    return pdef get_max_id(corpus):    """    Return the highest feature id that appears in the corpus.    For empty corpora (no features at all), return -1.    """    maxid = -1    for document in corpus:        maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document]))  # [-1] to avoid exceptions from max(empty)    return maxid#产生ID的关键类class FakeDict(object):    """    这个类的对象充当映射整数-str(integer)的字典    一个指定范围的整数小于0，numterms)。    这是为了避免在num术语庞大时分配真正的字典。    是对记忆的浪费。    """    def __init__(self, num_terms):        self.num_terms = num_terms    def __str__(self):        return "FakeDict(num_terms=%s)" % self.num_terms    def __getitem__(self, val):        if 0 <= val < self.num_terms:            return str(val)        raise ValueError("internal id out of bounds (%s, expected <0..%s))" %                         (val, self.num_terms))    def iteritems(self):        for i in xrange(self.num_terms):            yield i, str(i)    def keys(self):        """        重写.keys()函数，该函数用于确定最大值        语料库的内部id=词汇的维数。        为了避免实现整个范围(0，self。num术语)”,这将返回        最高的id=self。num_terms - 1]。        """        return [self.num_terms - 1]    def __len__(self):        return self.num_terms    def get(self, val, default=None):        if 0 <= val < self.num_terms:            return str(val)        return defaultdef dict_from_corpus(corpus):    """    扫描语料库中出现的所有单词id，然后构造并返回一个映射    它映射每个wordId-str(wordId)。    每当需要显示单词时，这个函数就会被使用(与之相反的是    他们的id)但是没有文字映射。生成的映射    只涵盖语料库中实际使用的词，最高的词是被发现的。        """    num_terms = 1 + get_max_id(corpus)    id2word = FakeDict(num_terms)    return id2worddef is_corpus(obj):    """   检查obj是否是一个文集。返回(is文集，新)2元组，在那里    obj是obj，如果obj是可迭代的，或者新生成的序列是相同的    obj是一个迭代器。    obj是一个语料库，如果它支持对文档进行迭代，那就是文档    反过来，任何操作都是2元组(int，float)的序列。    注意:一个“空的”语料库(空的输入序列)是不确定的，所以在本例中    结果被强制定义为is文集=False。    """    try:        if 'Corpus' in obj.__class__.__name__:  # the most common case, quick hack            return True, obj    except:        pass    try:        if hasattr(obj, 'next') or hasattr(obj, '__next__'):            # the input is an iterator object, meaning once we call next()            # that element could be gone forever. we must be careful to put            # whatever we retrieve back again            doc1 = next(obj)            obj = itertools.chain([doc1], obj)        else:            doc1 = next(iter(obj))  # empty corpus is resolved to False here        if len(doc1) == 0:  # sparse documents must have a __len__ function (list, tuple...)            return True, obj  # the first document is empty=>assume this is a corpus        id1, val1 = next(iter(doc1))  # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here        id1, val1 = int(id1), float(val1)  # must be a 2-tuple (integer, float)    except Exception:        return False, obj    return True, objdef get_my_ip():    """    尝试获取我们的外部ip(从pyro名称服务器的角度)    它试图回避伪造/etc/hosts条目和其他的问题    本地的错误配置，常常会破坏主机名的解析。    如果所有其他方法都失败了，则返回到简单的socket.gethostbyname()的查找。    """    import socket    try:        import Pyro4        # we know the nameserver must exist, so use it as our anchor point        ns = Pyro4.naming.locateNS()        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)        s.connect((ns._pyroUri.host, ns._pyroUri.port))        result, port = s.getsockname()    except:        try:            # see what ifconfig says about our default interface            import commands            result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:]            if len(result.split('.')) != 4:                raise Exception()        except:            # give up, leave the resolution to gethostbyname            result = socket.gethostbyname(socket.gethostname())    return resultclass RepeatCorpus(SaveLoad):    """    Used in the tutorial on distributed computing and likely not useful anywhere else.    """    def __init__(self, corpus, reps):        """        将文集作为另一个长度代表的语料库。这是通过    不断重复来自文集的文档，直到被请求    len(结果)= =代表的长度。重复做    出现动态=有效,通过“itertools”。        >>> corpus = [[(1, 0.5)], []] # 2 documents        >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents        [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]]        """        self.corpus = corpus        self.reps = reps    def __iter__(self):        return itertools.islice(itertools.cycle(self.corpus), self.reps)class RepeatCorpusNTimes(SaveLoad):    def __init__(self, corpus, n):        """        Repeat a `corpus` `n` times.        >>> corpus = [[(1, 0.5)], []]        >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times        [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]        """        self.corpus = corpus        self.n = n    def __iter__(self):        for _ in xrange(self.n):            for document in self.corpus:                yield documentclass ClippedCorpus(SaveLoad):    def __init__(self, corpus, max_docs=None):        """       返回一个语料库，它是输入可迭代语料库的“头部”。    maxdocs之后的任何文档都将被忽略。这有效地限制了    返回的语料库的长度到小于=maxdocs。设置“max_docs =没有”    “没有限制”，有效地包装了整个输入语料库。        """        self.corpus = corpus        self.max_docs = max_docs    def __iter__(self):        return itertools.islice(self.corpus, self.max_docs)    def __len__(self):        return min(self.max_docs, len(self.corpus))class SlicedCorpus(SaveLoad):    def __init__(self, corpus, slice_):        """        返回一个语料库，它是输入可迭代语料库的一部分。        只有当语料库可索引时，才可以使用负面切片。        否则，将对文集进行迭代。        切片也可以是一个np。ndarray来支持复杂的索引。        注意:计算切片库的大小是很昂贵的        当使用切片时，必须对其进行遍历。        使用列表或np。ndarray没有这个缺点，但是        消耗更多的内存。        """        self.corpus = corpus        self.slice_ = slice_        self.length = None    def __iter__(self):        if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0:            return (self.corpus.docbyoffset(i) for i in                    self.corpus.index[self.slice_])        else:            return itertools.islice(self.corpus, self.slice_.start,                                    self.slice_.stop, self.slice_.step)    def __len__(self):        # check cached length, calculate if needed        if self.length is None:            if isinstance(self.slice_, (list, np.ndarray)):                self.length = len(self.slice_)            else:                self.length = sum(1 for x in self)        return self.lengthdef safe_unichr(intval):    try:        return unichr(intval)    except ValueError:        # ValueError: unichr() arg not in range(0x10000) (narrow Python build)        s = "\\U%08x" % intval        # return UTF16 surrogate pair        return s.decode('unicode-escape')def decode_htmlentities(text):    """    Decode HTML entities in text, coded as hex, decimal or named.    Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'    >>> print(decode_htmlentities(u).encode('UTF-8'))    E tu vivrai nel terrore - L'aldilà (1981)    >>> print(decode_htmlentities("l&#39;eau"))    l'eau    >>> print(decode_htmlentities("foo &lt; bar"))    foo < bar    """    def substitute_entity(match):        try:            ent = match.group(3)            if match.group(1) == "#":                # decoding by number                if match.group(2) == '':                    # number is in decimal                    return safe_unichr(int(ent))                elif match.group(2) in ['x', 'X']:                    # number is in hex                    return safe_unichr(int(ent, 16))            else:                # they were using a name                cp = n2cp.get(ent)                if cp:                    return safe_unichr(cp)                else:                    return match.group()        except:            # in case of errors, return original input            return match.group()    return RE_HTML_ENTITY.sub(substitute_entity, text)def chunkize_serial(iterable, chunksize, as_numpy=False):    """   返回元素的返回元素，在纯大小的列表中。最后返回    元素可能更小(如果集合的长度不能被块大小整除)。    >>> print(list(grouper(range(10), 3)))    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]    """    it = iter(iterable)    while True:        if as_numpy:            # convert each document to a 2d numpy array (~6x faster when transmitting            # chunk data over the wire, in Pyro)            wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]]        else:            wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]        if not wrapped_chunk[0]:            break        # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference        yield wrapped_chunk.pop()grouper = chunkize_serialclass InputQueue(multiprocessing.Process):    def __init__(self, q, corpus, chunksize, maxsize, as_numpy):        super(InputQueue, self).__init__()        self.q = q        self.maxsize = maxsize        self.corpus = corpus        self.chunksize = chunksize        self.as_numpy = as_numpy    def run(self):        it = iter(self.corpus)        while True:            chunk = itertools.islice(it, self.chunksize)            if self.as_numpy:                # HACK XXX convert documents to numpy arrays, to save memory.                # This also gives a scipy warning at runtime:                # "UserWarning: indices array has non-integer dtype (float64)"                wrapped_chunk = [[np.asarray(doc) for doc in chunk]]            else:                wrapped_chunk = [list(chunk)]            if not wrapped_chunk[0]:                self.q.put(None, block=True)                break            try:                qsize = self.q.qsize()            except NotImplementedError:                qsize = '?'            logger.debug("prepared another chunk of %i documents (qsize=%s)" %                        (len(wrapped_chunk[0]), qsize))            self.q.put(wrapped_chunk.pop(), block=True)#endclass InputQueueif os.name == 'nt':    warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):        for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):            yield chunkelse:    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):        """        将一串值分成更小的块。        每个块的长度都是块大小，除了最后一个可能比较小的块。        一个曾经的输入流(来自生成器的语料库)是可以的，分块完成        有效地出现通过itertools。        如果maxsize-1，不要在连续的块收益之间等待，但是        相反，要不断地填充一个短队列(大小为maxsize)        提前的块。这是通过启动一个单独的进程来实现的，并且        这意味着减少输入/输出延迟，这在语料库出现时是非常重要的        从慢速介质(如硬盘)。        如果maxsize==0，不要在并行操作中浪费时间，只需要简单地生成块大小        通过“chunkize_serial()(没有I / O优化)。        >>> for chunk in chunkize(range(10), 4): print(chunk)        [0, 1, 2, 3]        [4, 5, 6, 7]        [8, 9]        """        assert chunksize > 0        if maxsize > 0:            q = multiprocessing.Queue(maxsize=maxsize)            worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy)            worker.daemon = True            worker.start()            while True:                chunk = [q.get(block=True)]                if chunk[0] is None:                    break                yield chunk.pop()        else:            for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):                yield chunkdef smart_extension(fname, ext):    fname, oext = os.path.splitext(fname)    if oext.endswith('.bz2'):        fname = fname + oext[:-4] + ext + '.bz2'    elif oext.endswith('.gz'):        fname = fname + oext[:-3] + ext + '.gz'    else:        fname = fname + oext + ext    return fnamedef pickle(obj, fname, protocol=2):    """Pickle object `obj` to file `fname`.    `protocol` defaults to 2 so pickled objects are compatible across    Python 2.x and 3.x.    """    with smart_open(fname, 'wb') as fout:  # 'b' for binary, needed on Windows        _pickle.dump(obj, fout, protocol=protocol)def unpickle(fname):    """Load pickled object from `fname`"""    with smart_open(fname, 'rb') as f:        # Because of loading from S3 load can't be used (missing readline in smart_open)        if sys.version_info > (3, 0):            return _pickle.load(f, encoding='latin1')        else:            return _pickle.loads(f.read())def revdict(d):    """    反向映射字典。    当两个键映射到相同的值时，只有一个键会被保存在这个值中。    结果(保留的是任意的)。    """    return dict((v, k) for (k, v) in iteritems(dict(d)))def toptexts(query, texts, index, n=10):    """   调试fnc以帮助检查最高n最相似的文档(根据a    相似索引索引)，以查看它们是否与查询相关。    文本是任何可以为每个文档返回深刻见解的对象    通过文本检验，如全文或代码片段。    返回一个3元组的列表(检验，doc与查询的相似性，文本检验)。    """    sims = index[query]  # perform a similarity query against the corpus    sims = sorted(enumerate(sims), key=lambda item: -item[1])    result = []    for topid, topcosine in sims[:n]:  # only consider top-n most similar docs        result.append((topid, topcosine, texts[topid]))    return resultdef randfname(prefix='gensim'):    randpart = hex(random.randint(0, 0xffffff))[2:]    return os.path.join(tempfile.gettempdir(), prefix + randpart)def upload_chunked(server, docs, chunksize=1000, preprocess=None):    """    内存友好的将文档上载到SimServer(或Pyro SimServer代理)。    使用该函数来训练或索引大型集合—避免发送    整个语料库作为一个单独的Pyro内存对象。这些文件    将会被发送到较小的块中，每一个都是纯文档大小的文档。    """    start = 0    for chunk in grouper(docs, chunksize):        end = start + len(chunk)        logger.info("uploading documents %i-%i" % (start, end - 1))        if preprocess is not None:            pchunk = []            for doc in chunk:                doc['tokens'] = preprocess(doc['text'])                del doc['text']                pchunk.append(doc)            chunk = pchunk        server.buffer(chunk)        start = enddef getNS(host=None, port=None, broadcast=True, hmac_key=None):    """    Return a Pyro name server proxy.    """    import Pyro4    try:        return Pyro4.locateNS(host, port, broadcast, hmac_key)    except Pyro4.errors.NamingError:        raise RuntimeError("Pyro name server not found")def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf={}):    """    使用名称服务器注册对象(如果不运行，启动名称服务器    但是)并且阻塞直到该守护进程终止。该对象在下面注册    如果设置了随机后缀，那么名字或名字加上一些随机后缀。    """    if random_suffix:        name += '.' + hex(random.randint(0, 0xffffff))[2:]    import Pyro4    with getNS(**ns_conf) as ns:        with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon:            # register server for remote access            uri = daemon.register(obj, name)            ns.remove(name)            ns.register(name, uri)            logger.info("%s registered with nameserver (URI '%s')" % (name, uri))            daemon.requestLoop()def has_pattern():    """    函数返回一个标志，指示是否安装了模式    """    try:        from pattern.en import parse        return True    except ImportError:        return Falsedef lemmatize(        content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,        stopwords=frozenset(), min_length=2, max_length=15):    """   这个函数只有在安装了可选的“模式”包时才可用。    使用来自模式的英语lemmatizer来提取utf-8编码的令牌    其基本形式=引理,如。“是”，“是”，“是”等等。    这是一个更聪明的版本，将单词上下文考虑在内。    在缺省情况下只考虑名词、动词、形容词和副词(=所有其他引理被丢弃)。    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']    >>> lemmatize('The study ranks high.')    ['study/NN', 'rank/VB', 'high/JJ']    >>> lemmatize('The ranks study hard.')    ['rank/NN', 'study/VB', 'hard/RB']    """    if not has_pattern():        raise ImportError("Pattern library is not installed. Pattern library is needed in order to use lemmatize function")    from pattern.en import parse    if light:        import warnings        warnings.warn("The light flag is no longer supported by pattern.")    # tokenization in `pattern` is weird; it gets thrown off by non-letters,    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little    # FIXME this throws away all fancy parsing cues, including sentence structure,    # abbreviations etc.    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))    parsed = parse(content, lemmata=True, collapse=False)    result = []    for sentence in parsed:        for token, tag, _, _, lemma in sentence:            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:                if allowed_tags.match(tag):                    lemma += "/" + tag[:2]                    result.append(lemma.encode('utf8'))    return resultdef mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):    """        创建一个随机的gensim稀疏矢量。每个坐标都是非0的        概率检验，每个非零的坐标值都是从        Poisson的一个参数，等于拉姆。    """    nnz = np.random.uniform(size=(dim,))    data = [(i, float(np.random.poisson(lam=lam) + 1.0))            for i in xrange(dim) if nnz[i] < prob_nnz]    return datadef mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):    """   创建一个随机的gensimstyle语料库，作为一个列表(int，float)元组，    用作模拟语料库。    """    data = [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam)            for _ in xrange(n_items)]    return datadef prune_vocab(vocab, min_reduce, trim_rule=None):    """    从vocab字典中删除所有条目，计数小于minreduce。    修改vocab，返回所有被修剪的计数的和。    """    result = 0    old_len = len(vocab)    for w in list(vocab):  # make a copy of dict's keys        if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule):  # vocab[w] <= min_reduce:            result += vocab[w]            del vocab[w]    logger.info("pruned out %i tokens with count <=%i (before %i, after %i)",                old_len - len(vocab), min_reduce, old_len, len(vocab))    return resultdef qsize(queue):    """返回可用的(近似)队列大小;不(OS X)-1。"""    try:        return queue.qsize()    except NotImplementedError:        # OS X doesn't support qsize        return -1RULE_DEFAULT = 0RULE_DISCARD = 1RULE_KEEP = 2def keep_vocab_item(word, count, min_count, trim_rule=None):    default_res = count >= min_count    if trim_rule is None:        return default_res    else:        rule_res = trim_rule(word, count, min_count)        if rule_res == RULE_KEEP:            return True        elif rule_res == RULE_DISCARD:            return False        else:            return default_resdef check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):    """    使用参数运行命令，并将输出作为字节字符串返回。    从Python 2.7中返回，因为它在stdlib中实现为纯Python。    >>> check_output(args=['/usr/bin/python', '--version'])    Python 2.6.2    Added extra KeyboardInterrupt handling    """    try:        logger.debug("COMMAND: %s %s", popenargs, kwargs)        process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)        output, unused_err = process.communicate()        retcode = process.poll()        if retcode:            cmd = kwargs.get("args")            if cmd is None:                cmd = popenargs[0]            error = subprocess.CalledProcessError(retcode, cmd)            error.output = output            raise error        return output    except KeyboardInterrupt:        process.terminate()        raisedef sample_dict(d, n=10, use_random=True):    """    从字典d中选择n项，并将它们作为一个列表返回。    如果使用随机选择的话，这些项目是随机选择的。    根据自然的“迭代”的说法。    """    selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n)    return [(key, d[key]) for key in selected_keys]def strided_windows(ndarray, window_size):    """    产生一个numpy。窗户的ndarray，从一个滑动窗口    >>> strided_windows(np.arange(5), 2)    array([[0, 1],           [1, 2],           [2, 3],           [3, 4]])    >>> strided_windows(np.arange(10), 5)    array([[0, 1, 2, 3, 4],           [1, 2, 3, 4, 5],           [2, 3, 4, 5, 6],           [3, 4, 5, 6, 7],           [4, 5, 6, 7, 8],           [5, 6, 7, 8, 9]])    Args:        ndarray: either a numpy.ndarray or something that can be converted into one.        window_size: sliding window size.    Returns:        numpy.ndarray of the subsequences produced by sliding a window of the given size over        the `ndarray`. Since this uses striding, the individual arrays are views rather than        copies of `ndarray`. Changes to one view modifies the others and the original.    """    ndarray = np.asarray(ndarray)    if window_size == ndarray.shape[0]:        return np.array([ndarray])    elif window_size > ndarray.shape[0]:        return np.ndarray((0, 0))    stride = ndarray.strides[0]    return np.lib.stride_tricks.as_strided(        ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size),        strides=(stride, stride))def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False):    """使用窗口大小的滑动窗口在给定的文本上生成一个生成器。        所生成的窗口是一些文本的子序列的视图。使用深拷贝        相反,通过“复制= True”。        参数:        文本:字符串句子的列表。        窗口大小:滑动窗口的大小。        复制:错误使用文本的视图(默认)或True来生成深度副本。        忽略大小:忽略至少窗口大小(默认行为)的文档。        如果是假的，窗口大小的文档将作为完整的文档被产生。    """    for doc_num, document in enumerate(texts):        for window in _iter_windows(document, window_size, copy, ignore_below_size):            if include_doc_num:                yield (doc_num, window)            else:                yield windowdef _iter_windows(document, window_size, copy=False, ignore_below_size=True):    doc_windows = strided_windows(document, window_size)    if doc_windows.shape[0] == 0:        if not ignore_below_size:            yield document.copy() if copy else document    else:        for doc_window in doc_windows:            yield doc_window.copy() if copy else doc_window
阅读全文
0 0