# Scrapy default settings
# (Transcribed from a blog repost of scrapy's default_settings module.)

# Name identifying this bot; used for logging and the default User-Agent.
BOT_NAME = 'scrapybot'

# CloseSpider extension thresholds; 0 disables each condition.
CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0

# Module to search for project-specific Scrapy commands ('' = none).
COMMANDS_MODULE = ''

# Maximum number of items processed in parallel in the item processor.
CONCURRENT_ITEMS = 100

# Concurrent request limits: global, per domain, per IP (0 = unlimited;
# a non-zero per-IP limit takes precedence over the per-domain one).
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0

COOKIES_ENABLED = True
COOKIES_DEBUG = False

# Item class instantiated when no explicit class is given.
DEFAULT_ITEM_CLASS = 'scrapy.item.Item'

# Headers added to every request unless overridden per-request.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Crawl-depth limit, stats collection and priority adjustment
# (0 = unlimited depth / no priority adjustment).
DEPTH_LIMIT = 0
DEPTH_STATS = True
DEPTH_PRIORITY = 0

# Cache DNS lookups in memory.
DNSCACHE_ENABLED = True

# Seconds to wait between consecutive requests to the same website.
DOWNLOAD_DELAY = 0

# Per-scheme download handlers: *_BASE holds the defaults,
# DOWNLOAD_HANDLERS holds project-level overrides merged on top.
DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
}

DOWNLOAD_TIMEOUT = 180  # 3 mins

DOWNLOADER_DEBUG = False

# Twisted HTTP client factory and TLS context factory used by the downloader.
DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.webclient.ScrapyClientContextFactory'

# Project-level downloader-middleware overrides (merged with the *_BASE dict).
DOWNLOADER_MIDDLEWARES = {}

# Built-in downloader middlewares and their order
# (lower value = closer to the engine, higher = closer to the downloader).
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
    'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
    'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
    'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}

DOWNLOADER_STATS = True

# Class used to detect and filter duplicate requests.
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'

import os
import sys

# Editor used by the "scrapy edit" command: honour $EDITOR when set,
# otherwise fall back to IDLE on Windows or vi elsewhere.
try:
    EDITOR = os.environ['EDITOR']
except KeyError:
    if sys.platform == 'win32':
        # NOTE(review): the bare '%s' placeholder appears intentional —
        # presumably formatted with the interpreter path at the call site.
        EDITOR = '%s -m idlelib.idle'
    else:
        EDITOR = 'vi'

# Project-level extension overrides; *_BASE lists the built-in extensions
# (the 0 order values indicate extensions don't depend on ordering).
EXTENSIONS = {}

EXTENSIONS_BASE = {
    'scrapy.contrib.corestats.CoreStats': 0,
    'scrapy.webservice.WebService': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.contrib.memusage.MemoryUsage': 0,
    'scrapy.contrib.memdebug.MemoryDebugger': 0,
    'scrapy.contrib.closespider.CloseSpider': 0,
    'scrapy.contrib.feedexport.FeedExporter': 0,
    'scrapy.contrib.logstats.LogStats': 0,
    'scrapy.contrib.spiderstate.SpiderState': 0,
    'scrapy.contrib.throttle.AutoThrottle': 0,
}

# Feed export defaults.
FEED_URI = None
FEED_URI_PARAMS = None  # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
FEED_STORAGES = {}
# Per-scheme feed storage backends ('' = the default, a local file).
FEED_STORAGES_BASE = {
    '': 'scrapy.contrib.feedexport.FileFeedStorage',
    'file': 'scrapy.contrib.feedexport.FileFeedStorage',
    'stdout': 'scrapy.contrib.feedexport.StdoutFeedStorage',
    's3': 'scrapy.contrib.feedexport.S3FeedStorage',
    'ftp': 'scrapy.contrib.feedexport.FTPFeedStorage',
}  # BUG FIX: closing brace was missing in the transcription

FEED_EXPORTERS = {}
# Serialization formats available for exported feeds.
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.contrib.exporter.JsonItemExporter',
    'jsonlines': 'scrapy.contrib.exporter.JsonLinesItemExporter',
    'csv': 'scrapy.contrib.exporter.CsvItemExporter',
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
    'marshal': 'scrapy.contrib.exporter.MarshalItemExporter',
    'pickle': 'scrapy.contrib.exporter.PickleItemExporter',
}

# HTTP cache middleware defaults (disabled out of the box).
HTTPCACHE_ENABLED = False
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0  # 0 = cached responses never expire
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'

# Component that runs items through the pipeline chain.
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'

# Item pipelines are typically set in specific commands settings
ITEM_PIPELINES = []

# Logging defaults.
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None

LOG_UNSERIALIZABLE_REQUESTS = False

# Seconds between LogStats progress reports.
LOGSTATS_INTERVAL = 60.0

# SMTP defaults for sending crawl-related mail.
MAIL_DEBUG = False
MAIL_HOST = 'localhost'
MAIL_PORT = 25
MAIL_FROM = 'scrapy@localhost'
MAIL_PASS = None
MAIL_USER = None

MEMDEBUG_ENABLED = False  # enable memory debugging
MEMDEBUG_NOTIFY = []  # send memory debugging report by mail at engine shutdown

# Memory-usage extension (0 / empty = no limit / no notification).
MEMUSAGE_ENABLED = False
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_REPORT = False
MEMUSAGE_WARNING_MB = 0

# Module where the "genspider" command creates new spiders ('' = unset).
NEWSPIDER_MODULE = ''

# Apply a random factor to DOWNLOAD_DELAY between requests.
RANDOMIZE_DOWNLOAD_DELAY = True

# Redirect middleware defaults.
REDIRECT_ENABLED = True
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2

REFERER_ENABLED = True

# Retry middleware defaults.
RETRY_ENABLED = True
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 503, 504, 400, 408]
RETRY_PRIORITY_ADJUST = -1

ROBOTSTXT_OBEY = False

# Scheduler and its disk/memory request queue implementations
# (LIFO queues give a depth-first crawl order).
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'

SPIDER_MANAGER_CLASS = 'scrapy.spidermanager.SpiderManager'

# Project-level spider-middleware overrides (merged with the *_BASE dict).
SPIDER_MIDDLEWARES = {}

# Built-in spider middlewares and their order
# (lower value = closer to the engine, higher = closer to the spider).
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
    # Spider side
}

# Modules where Scrapy looks for spiders.
SPIDER_MODULES = []

from os.path import abspath, dirname, join

# Stats collection defaults.
STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
STATS_DUMP = True

STATSMAILER_RCPTS = []

# Directory holding the project/spider templates shipped alongside this module.
TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))

# Maximum allowed URL length (historical Internet Explorer limit).
URLLENGTH_LIMIT = 2083

# Default User-Agent, advertising the installed Scrapy version.
USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % __import__('scrapy').__version__

# Telnet console defaults (PORT is a [min, max] port range).
TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '0.0.0.0'

# Web service defaults (PORT is a [min, max] port range).
WEBSERVICE_ENABLED = True
WEBSERVICE_LOGFILE = None
WEBSERVICE_PORT = [6080, 7030]
WEBSERVICE_HOST = '0.0.0.0'
WEBSERVICE_RESOURCES = {}
WEBSERVICE_RESOURCES_BASE = {
    'scrapy.contrib.webservice.crawler.CrawlerResource': 1,
    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
    'scrapy.contrib.webservice.stats.StatsResource': 1,
}

# Spider contracts (used by the "check" command) and their order.
SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
    'scrapy.contracts.default.ReturnsContract': 2,
    'scrapy.contracts.default.ScrapesContract': 3,
}

# End of Scrapy default settings.
# (Blog page footer and unrelated SEO link text removed — not part of the settings.)