# -*- coding: utf-8 -*-

# Scrapy settings for hg3535 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'hg3535'

SPIDER_MODULES = ['hg3535.spiders']
NEWSPIDER_MODULE = 'hg3535.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT must be a plain string, not a set literal
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 0

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# from scrapy_deltafetch
# SPIDER_MIDDLEWARES = {
#     # 'scrapy_deltafetch.DeltaFetch': 100,
#     'hg3535.middlewares.Hg3535SpiderMiddleware': 543,
# }
# DELTAFETCH_ENABLED = True

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'hg3535.middlewares.Hg3535DownloaderMiddleware': 200,
    'hg3535.middlewares.Hg3535timeoutDownloaderMiddleware': 200,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 500,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 502,
}
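
# A minimal, hypothetical sketch of what Hg3535timeoutDownloaderMiddleware
# could look like (the real class lives in hg3535/middlewares.py and is not
# shown in this file). A downloader middleware can override the download
# timeout per request via request.meta before the request is sent:
#
# class Hg3535timeoutDownloaderMiddleware(object):
#     def process_request(self, request, spider):
#         # Override DOWNLOAD_TIMEOUT for this request only
#         request.meta['download_timeout'] = 10
#         return None  # returning None lets Scrapy continue processing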

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipeline
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {}
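
# To enable a pipeline, map its dotted path to an order value in 0-1000,
# e.g. (Hg3535Pipeline is the name the default Scrapy template would
# generate; it may not exist in this project):
# ITEM_PIPELINES = {
#     'hg3535.pipelines.Hg3535Pipeline': 300,
# }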

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# DOWNLOAD_TIMEOUT = 180
REACTOR_THREADPOOL_MAXSIZE = 40
# LOG_LEVEL = 'INFO'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 10
REDIRECT_ENABLED = False
CONCURRENT_ITEMS = 1000
# SCHEDULER_PERSIST = False  # Keep the scheduler queue and dupefilter records on close? True = keep, False = clear
# SCHEDULER_FLUSH_ON_START = False
# TELNETCONSOLE_PORT = None
# TELNETCONSOLE_ENABLED = False
# (disabled to work around: AttributeError: 'TelnetConsole' object has no attribute 'port')
# RETRY_ENABLED = True
# RETRY_TIMES = 2
# RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408]
# LOG_LEVEL = 'DEBUG'
# SCHEDULER_PERSIST = False
# LOG_FILE = './log/'

# upMatch
# POST_HOST = 'localhost'
# POST_DATABASE = 'kaiyou'
# POST_USER = 'kaiyou'
# POST_PORT = '10432'
# POST_PASSWORD = '123456'

# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"
# # Don't clear the Redis queue; allows pausing/resuming crawls.
# SCHEDULER_PERSIST = False
# # Schedule requests using a priority queue (the default).
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# REDIS_HOST = '192.168.2.200'
# REDIS_PORT = 6379
# REDIS_PARAMS = {'password': '123456', 'db': 1}
# Randomize the download delay
# RANDOMIZE_DOWNLOAD_DELAY = True
# # AutoThrottle extension
# AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_DEBUG = True
# AUTOTHROTTLE_TARGET_CONCURRENCY = 0.25
# AUTOTHROTTLE_MAX_DELAY = 5
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# REDIS_URL = 'redis://:123456@192.168.2.200:6379'
# SCHEDULER_PERSIST = False

# mongo
M_HOST = '192.168.2.200'
M_PORT = 27017
M_USER = 'kaiyou'
M_DB = 'kaiyou'
M_PASSWORD = 'kaiyou'
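
# A minimal sketch (illustrative only -- "MongoPipeline" is a hypothetical
# name, not a class known to exist in this project) of how a pipeline could
# read the Mongo settings above via crawler.settings:
#
# import pymongo
#
# class MongoPipeline(object):
#     @classmethod
#     def from_crawler(cls, crawler):
#         s = crawler.settings
#         uri = 'mongodb://%s:%s@%s:%s/%s' % (
#             s['M_USER'], s['M_PASSWORD'], s['M_HOST'], s['M_PORT'], s['M_DB'])
#         return cls(pymongo.MongoClient(uri)[s['M_DB']])
#
#     def __init__(self, db):
#         self.db = db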

# redis
R_HOST = '192.168.2.200'
R_PORT = 6379
R_DB = 1
R_PASSWORD = '123456'
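
# Likewise (illustrative only), the Redis settings could be consumed as:
#
# import redis
#
# r = redis.StrictRedis(host=R_HOST, port=R_PORT, db=R_DB, password=R_PASSWORD)
# r.ping()  # raises redis.exceptions.ConnectionError if unreachable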

# token
T_USER = 'python'
T_PASSWORD = 'python888'
T_URL = 'http://stadmin.bocai108.com/getToken'
# Domain
POST_URL = 'http://stadmin.bocai108.com:19093'
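
# A hedged sketch of fetching a token from T_URL (the HTTP method, parameter
# names, and response shape below are assumptions -- this file does not say
# how the getToken endpoint is called):
#
# import requests
#
# resp = requests.post(T_URL, data={'user': T_USER, 'password': T_PASSWORD})
# token = resp.json().get('token')  # assumed response field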