# Author: 陈闻超
# Version: 1.0

import re

from scrapy import Request
from scrapy.http import TextResponse

# The module and the class are both called DownloadSpider, so the class has to
# be imported from the module explicitly.  The change set that adds this file
# corrects RespoDL/spiders/HuaianshiSpider.py the same way (it previously used
# `from RespoDL.spiders import DownloadSpider`).
from RespoDL.spiders.DownloadSpider import DownloadSpider

from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)


class ChongqingshiSpider(DownloadSpider):
    """Save every listing page of the cq.gov.cn public-mail list and its detail pages.

    The listing is paged by clicking a ``#btnNext`` button, so it is driven
    through a Selenium-controlled Chrome running inside a virtual display.
    Detail pages are fetched as ordinary Scrapy requests.
    """

    name = 'chongqingshi'
    start_urls = ['http://www.cq.gov.cn/publicmail/citizen/ReleaseMailListDistrict.aspx']
    detail_url = 'http://www.cq.gov.cn/publicmail/citizen/'

    # Removes everything except digits; compiled once instead of per page.
    _NON_DIGITS = re.compile(r'[^\d]')
    # Upper bound, in seconds, for the browser to show the next listing page.
    _PAGE_LOAD_TIMEOUT = 100

    def __init__(self, *args, **kwargs):
        """Start the virtual display and the Chrome driver used for paging."""
        super(ChongqingshiSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        try:
            self.driver = webdriver.Chrome()
        except Exception:
            # Do not leave the virtual display running if the browser cannot start.
            self.display.stop()
            raise
        self.driver.wait = WebDriverWait(self.driver, self._PAGE_LOAD_TIMEOUT)

    def closed(self, reason):
        """Shut down the browser and the virtual display when the spider closes.

        Scrapy calls ``closed(reason)`` on spiders that define it; a parent
        implementation, if one exists, is still invoked afterwards.
        """
        driver = getattr(self, 'driver', None)
        display = getattr(self, 'display', None)
        try:
            if driver is not None:
                driver.quit()
        finally:
            if display is not None:
                display.stop()
            parent_closed = getattr(super(ChongqingshiSpider, self), 'closed', None)
            if callable(parent_closed):
                parent_closed(reason)

    def parse(self, response):
        """Walk the listing page by page, saving each page and queuing its details.

        Yields one ``Request`` per link found in ``table#dgrdMail``; the listing
        page number travels in ``meta['page']`` so ``parse_detail`` can store the
        detail page next to its listing page.
        """
        self.driver.get(response.url)
        while True:
            page = TextResponse(response.url, body=self.driver.page_source, encoding='utf-8')
            cur_page = page.css('input#txtPage::attr(value)').extract_first()
            if cur_page is None:
                self.log('Current page number could not be found')
                break

            filename = 'downloads/' + self.name + '/page_' + cur_page + '/' + cur_page
            self.write_file(filename, page.text)

            for issue in page.css("table#dgrdMail tr td a::attr(href)").extract():
                yield Request(self.detail_url + issue, meta={'page': cur_page},
                              callback=self.parse_detail)

            current_no = self._digits_to_int(cur_page)
            last_no = self._last_page_number(page)
            if current_no is None or last_no is None:
                self.log('Page counter could not be parsed')
                break
            if current_no >= last_no:
                self.log('Reached last page')
                break
            # Stop instead of re-scraping the same page forever when paging fails.
            if not self._go_to_next_page(cur_page):
                break

    def parse_detail(self, response):
        """Save one detail page under the folder of the listing page it came from."""
        page = response.request.meta['page']
        # The query string identifies the item and doubles as the file name.
        filename = 'downloads/' + self.name + '/page_' + page + '/' + response.url.split("?")[-1]
        self.write_file(filename, response.text)

    @classmethod
    def _digits_to_int(cls, text):
        """Return the digits contained in *text* as an int, or None if there are none."""
        digits = cls._NON_DIGITS.sub('', text or '')
        return int(digits) if digits else None

    def _last_page_number(self, page):
        """Return the last page number read from the pager table, or None.

        The value is taken from the third text cell of ``table.table_list_rq``
        (presumably the "total pages" label -- confirm against the live page).
        """
        cells = page.css("table.table_list_rq tr td::text").extract()
        if len(cells) < 3:
            return None
        return self._digits_to_int(cells[2])

    def _go_to_next_page(self, cur_page):
        """Click ``#btnNext`` and wait until ``#txtPage`` no longer shows *cur_page*.

        Returns True when the browser shows a different page, False when the
        button cannot be clicked or the page does not change within the timeout.
        """
        try:
            self.driver.find_element(By.CSS_SELECTOR, '#btnNext').click()
        except WebDriverException:
            self.log('Button could not be found')
            return False

        def page_changed(driver):
            try:
                field = driver.find_element(By.CSS_SELECTOR, '#txtPage')
                return field.get_attribute('value') != cur_page
            except (NoSuchElementException, StaleElementReferenceException):
                # While the page reloads the field can be missing or stale; keep polling.
                return False

        try:
            self.driver.wait.until(page_changed)
        except TimeoutException:
            self.log('Next page did not load in time')
            return False
        return True