scikit-learn中load_files函数源码解析

来源：互联网 | 发布：淘宝客推广公司 | 编辑：程序博客网 | 时间：2024/06/05 10:14
# load_files walkthrough
def load_files(container_path, description=None, categories=None,
               load_content=True, shuffle=True, encoding=None,
               decode_error='strict', random_state=0):
    """Collect the files stored in per-category sub-directories.

    Each immediate sub-directory of ``container_path`` is treated as one
    category. Sub-directories are visited in sorted order and numbered from
    0; every entry found inside a sub-directory is recorded with that
    number as its label.

    Parameters
    ----------
    container_path : path of the directory holding the category folders.
    description : value stored unchanged under ``DESCR`` in the result.
    categories : if not None, only sub-directories whose name is contained
        in ``categories`` are kept.
    load_content : if true, every file is read in binary mode and the
        contents are returned under ``data``.
    shuffle : if true, ``filenames`` and ``target`` are reordered with the
        same random permutation.
    encoding : if not None (and ``load_content`` is true), each file's bytes
        are decoded with this encoding.
    decode_error : error policy passed to ``bytes.decode``.
    random_state : handed to ``check_random_state``; the returned object's
        ``shuffle`` method drives the permutation.

    Returns
    -------
    A ``Bunch`` built with the keys ``filenames``, ``target_names``,
    ``target`` and ``DESCR``, plus ``data`` when ``load_content`` is true.
    """
    # Keep only real directories (optionally restricted to `categories`).
    # Sorting the listing makes the label numbering deterministic.
    target_names = []
    for entry in sorted(listdir(container_path)):
        if not isdir(join(container_path, entry)):
            continue
        if categories is not None and entry not in categories:
            continue
        target_names.append(entry)

    # `filenames` holds the joined path of every entry in a category folder
    # and `target` holds the category index of the same entry; both are
    # flat lists that stay aligned position by position.
    filenames = []
    target = []
    for label, folder in enumerate(target_names):
        folder_path = join(container_path, folder)
        for document in sorted(listdir(folder_path)):
            filenames.append(join(folder_path, document))
            target.append(label)

    # Convert to 1-D arrays so both can be reordered by fancy indexing.
    filenames = np.array(filenames)
    target = np.array(target)

    if shuffle:
        random_state = check_random_state(random_state)
        indices = np.arange(filenames.shape[0])
        random_state.shuffle(indices)
        # The same permutation is applied to both arrays to keep each
        # label attached to its file.
        filenames = filenames[indices]
        target = target[indices]

    if not load_content:
        return Bunch(filenames=filenames,
                     target_names=target_names,
                     target=target,
                     DESCR=description)

    # Read raw bytes first; decoding is a separate, optional step.
    data = []
    for filename in filenames:
        with open(filename, 'rb') as f:
            data.append(f.read())
    if encoding is not None:
        data = [d.decode(encoding, decode_error) for d in data]

    return Bunch(data=data,
                 filenames=filenames,
                 target_names=target_names,
                 target=target,
                 DESCR=description)
1 0