diff --git a/.idea/RespoDL.iml b/.idea/RespoDL.iml
new file mode 100644
index 0000000000000000000000000000000000000000..6711606311e2664bd835f92b5c114681d2e284f5
--- /dev/null
+++ b/.idea/RespoDL.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..d8a4a57039c9f89cdb8470b6a385765b25190cea
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
+    <OptionsSetting value="true" id="Add" />
+    <OptionsSetting value="true" id="Remove" />
+    <OptionsSetting value="true" id="Checkout" />
+    <OptionsSetting value="true" id="Update" />
+    <OptionsSetting value="true" id="Status" />
+    <OptionsSetting value="true" id="Edit" />
+    <ConfirmationsSetting value="0" id="Add" />
+    <ConfirmationsSetting value="0" id="Remove" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.2 (/usr/bin/python3.5)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000000000000000000000000000000000000..7c7d0d6fbd989d7f1e5bde80d52f15f76ce852a1
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/RespoDL.iml" filepath="$PROJECT_DIR$/.idea/RespoDL.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..94a25f7f4cb416c083d265558da75d457237d671
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/RespoDL/__init__.py b/RespoDL/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/RespoDL/items.py b/RespoDL/items.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e6b38a1174fef24a6d7bda6ab0c9b3242dedb6
--- /dev/null
+++ b/RespoDL/items.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class RespodlItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    file_urls = scrapy.Field()
+    files = scrapy.Field()
+    image_urls = scrapy.Field()
+    images = scrapy.Field()
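As a hypothetical usage sketch (not part of the patch): RespodlItem's four fields exist so that the file and image pipelines enabled in settings.py below can act on them. A spider callback could populate the item like this; the spider name, start URL, and selectors here are illustrative assumptions only.

    # Sketch: fill RespodlItem's URL fields and let the Files/Images
    # pipelines (configured in settings.py) perform the downloads.
    import scrapy

    from RespoDL.items import RespodlItem


    class ExampleSpider(scrapy.Spider):
        name = 'example'                      # hypothetical spider name
        start_urls = ['http://example.com/']  # hypothetical start page

        def parse(self, response):
            item = RespodlItem()
            # The pipelines read absolute URLs from 'file_urls'/'image_urls'
            # and write their download results into 'files'/'images'.
            item['file_urls'] = [response.urljoin(u) for u in
                                 response.css('a::attr(href)').extract()]
            item['image_urls'] = [response.urljoin(u) for u in
                                  response.css('img::attr(src)').extract()]
            yield item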
diff --git a/RespoDL/pipelines.py b/RespoDL/pipelines.py
new file mode 100644
index 0000000000000000000000000000000000000000..4613fae70ccbbed284ddc16cdc7e08c3b17bdfdf
--- /dev/null
+++ b/RespoDL/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class RespodlPipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/RespoDL/settings.py b/RespoDL/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..39edee2b7d0cd50baf43e47a3806e80a5d37b3b4
--- /dev/null
+++ b/RespoDL/settings.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+import os
+
+# Scrapy settings for RespoDL project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'RespoDL'
+
+SPIDER_MODULES = ['RespoDL.spiders']
+NEWSPIDER_MODULE = 'RespoDL.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'RespoDL.middlewares.MyCustomSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'RespoDL.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'scrapy.pipelines.images.ImagesPipeline': 1,
+    'scrapy.pipelines.files.FilesPipeline': 2
+}
+
+CUR_DIR = os.path.dirname(os.path.realpath(__file__))
+
+FILES_STORE = os.path.join(CUR_DIR, '..', 'files')
+FILES_URLS_FIELD = 'file_urls'
+FILES_RESULT_FIELD = 'files'
+
+IMAGES_STORE = os.path.join(CUR_DIR, '..', 'images')
+IMAGES_URLS_FIELD = 'image_urls'
+IMAGES_RESULT_FIELD = 'images'
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
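As a sanity check on the storage settings above: Scrapy's stock FilesPipeline and ImagesPipeline (in the 1.x series) name downloads after the SHA1 hash of the request URL under a full/ subdirectory of the configured store. The snippet below, using a hypothetical URL, predicts where one download would land relative to the project root.

    # Predict the on-disk path FilesPipeline would use for one URL.
    # 'report.html' is a hypothetical example, not a URL from the spider.
    import hashlib
    import os

    url = 'http://example.com/report.html'
    digest = hashlib.sha1(url.encode('utf-8')).hexdigest()
    # FilesPipeline keeps the URL's extension and stores under <FILES_STORE>/full/
    print(os.path.join('files', 'full', digest + '.html'))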
diff --git a/RespoDL/spiders/DownloadSpider.py b/RespoDL/spiders/DownloadSpider.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ffe6deb3e724a6e14919b92e86e4a60f56486b2
--- /dev/null
+++ b/RespoDL/spiders/DownloadSpider.py
@@ -0,0 +1,15 @@
+# Author: 陈闻超
+# Version: 1.0
+
+import scrapy
+import codecs
+import os
+
+class DownloadSpider(scrapy.Spider):
+    """Base spider that saves page sources to local UTF-8 files."""
+
+    def write_file(self, filename, source):
+        # Create the target directory on first use, then write the page source.
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+        with codecs.open(filename, 'w', 'utf-8') as f:
+            f.write(source)
diff --git a/RespoDL/spiders/HuaianshiSpider.py b/RespoDL/spiders/HuaianshiSpider.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f8a1c86035cb0ad544806c2bec3e2c98f86dd7
--- /dev/null
+++ b/RespoDL/spiders/HuaianshiSpider.py
@@ -0,0 +1,52 @@
+# Author: 陈闻超
+# Version: 1.0
+
+import re
+
+from scrapy import Request
+from scrapy import FormRequest
+
+from RespoDL.spiders.DownloadSpider import DownloadSpider
+
+class HuaianshiSpider(DownloadSpider):
+    name = 'huaianshi'
+    start_urls = ['http://222.184.118.98/haportal/jsp/portal/allportalPublicityList.do']
+
+    def build_page_request(self, page):
+        # POST back to the listing endpoint to request a specific result page.
+        return FormRequest("http://222.184.118.98/haportal/jsp/portal/allportalPublicityList.do",
+                           formdata={'formOrigin': '0',
+                                     'page': str(page)},
+                           headers={
+                               'referer': 'http://222.184.118.98/haportal/jsp/portal/allportalPublicityList.do'},
+                           callback=self.parse,)
+
+    def parse(self, response):
+        # The pager's first <span> holds the current page number.
+        cur_page = response.css("div.mainbody table tr td span::text").extract_first().strip()
+        filename = 'downloads/' + self.name + '/page_' + cur_page + '/' + cur_page
+        self.write_file(filename, response.text)
+
+        # Skip the header row, then extract the numeric form id from each
+        # row's onclick handler and request its detail page.
+        rows = response.css("div.list tr")[1:]
+        for row in rows:
+            cells = row.css("td")
+            url = cells.css("a p::attr(onclick)").extract_first()
+            url = re.search(r'\d+', url).group(0)
+            url = response.urljoin('allportalPublicityView.do?formId=' + url)
+            yield Request(url, meta={'page': cur_page}, callback=self.parse_detail)
+
+        # The pager's second <span> holds the last page number; keep paging.
+        last_page = response.css("div.mainbody table tr td span::text").extract()[1].strip()
+        if int(cur_page) < int(last_page):
+            yield self.build_page_request(int(cur_page) + 1)
+
+    def parse_detail(self, response):
+        page = response.request.meta['page']
+        page_source = response.text
+        # Repair the page's malformed charset declaration (charset=gb2312" ->
+        # charset="gb2312") so the saved copy renders correctly.
+        page_source = re.sub(r'charset=gb2312"', 'charset="gb2312"', page_source)
+        filename = 'downloads/' + self.name + '/page_' + page + '/' + response.url.split("?")[-1]
+        self.write_file(filename, page_source)
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..95c539c6be7cca69547f543f04f75f026e697e85
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = RespoDL.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = RespoDL
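Finally, a minimal launch sketch, assuming it is run from the checkout root with Scrapy installed. This hypothetical helper is just the programmatic equivalent of `scrapy crawl huaianshi` and is not part of the patch.

    # run.py (hypothetical) - same effect as `scrapy crawl huaianshi`.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from RespoDL.spiders.HuaianshiSpider import HuaianshiSpider

    # get_project_settings() locates scrapy.cfg and loads RespoDL.settings,
    # so the pipelines and storage paths above apply to this crawl too.
    process = CrawlerProcess(get_project_settings())
    process.crawl(HuaianshiSpider)
    process.start()  # blocks until the crawl finishes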