Skip to content
Snippets Groups Projects
Commit 3eb423b1 authored by Wenchao Chen's avatar Wenchao Chen
Browse files

Adding Downloader for ChongqingshiSpider

parent 54e58d4a
No related branches found
No related tags found
No related merge requests found
# Author: 陈闻超
# Version: 1.0
import re
from scrapy import Request
from scrapy.http import TextResponse
from RespoDL.spiders.DownloadSpider import DownloadSpider
from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
class ChongqingshiSpider(DownloadSpider):
    """Save the Chongqing public-mail listing pages and the issues they link to.

    The listing is paged by clicking the ``#btnNext`` button, so listing pages
    are rendered through a Selenium Chrome session running inside a
    pyvirtualdisplay ``Display``.  Issue detail pages are fetched with plain
    Scrapy requests.  Files are written through ``self.write_file``, which is
    not defined in this class (presumably inherited from ``DownloadSpider``).
    """

    name = 'chongqingshi'
    start_urls = ['http://www.cq.gov.cn/publicmail/citizen/ReleaseMailListDistrict.aspx']
    # The hrefs found on a listing page are appended to this prefix.
    detail_url = 'http://www.cq.gov.cn/publicmail/citizen/'

    # Removes every non-digit character; compiled once instead of per page.
    _non_digits = re.compile(r"\D")

    def __init__(self, *args, **kwargs):
        """Start the virtual display and the Chrome session used for paging."""
        super(ChongqingshiSpider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.download_delay = 0.25
        try:
            self.driver = webdriver.Chrome()
        except Exception:
            # Do not leave the virtual display running if Chrome cannot start.
            self.display.stop()
            raise
        # Shared wait object (100 second timeout) used after each page change.
        self.driver.wait = WebDriverWait(self.driver, 100)

    def parse(self, response):
        """Walk the listing page by page, saving each page and queueing its issues.

        Every listing page is written to
        ``downloads/<name>/page_<n>/<n>`` and one request per issue link is
        yielded with the page number in ``meta['page']``.  Paging stops at the
        last page, or as soon as the page number cannot be read, the next
        button cannot be used, or the listing fails to advance.
        """
        self.driver.get(response.url)
        previous_page = None
        while True:
            page = TextResponse(response.url, body=self.driver.page_source, encoding='utf-8')
            cur_page = page.css('input#txtPage::attr(value)').extract_first()
            if cur_page is None:
                self.log('Current page number could not be found')
                break
            if cur_page == previous_page:
                # Guards against saving the same page over and over.
                self.log('Listing did not advance past page ' + cur_page)
                break

            filename = 'downloads/' + self.name + '/page_' + cur_page + '/' + cur_page
            self.write_file(filename, page.text)
            for issue in page.css("table#dgrdMail tr td a::attr(href)").extract():
                yield Request(self.detail_url + issue, meta={'page': cur_page},
                              callback=self.parse_detail)

            last_page = self._last_page_number(page)
            try:
                current_number = int(cur_page)
            except ValueError:
                current_number = None
            if last_page is None or current_number is None:
                self.log('Pagination info could not be read')
                break
            if current_number >= last_page:
                self.log('Reached last page')
                break
            if not self._go_to_next_page(cur_page):
                break
            previous_page = cur_page

    def _last_page_number(self, page):
        """Return the last page number shown in the pager, or None if unreadable.

        The third text cell of ``table.table_list_rq`` is used, reduced to its
        digits.
        """
        cells = page.css("table.table_list_rq tr td::text").extract()
        if len(cells) < 3:
            return None
        digits = self._non_digits.sub("", cells[2])
        try:
            return int(digits)
        except ValueError:
            return None

    def _go_to_next_page(self, cur_page):
        """Click ``#btnNext`` and wait until the page-number input changes.

        Returns True when the input no longer shows ``cur_page``; returns False
        when the button cannot be found or clicked, or when the wait times out.
        """
        # Local import: selenium is already a dependency of this module, and
        # this keeps the new name inside the class.
        from selenium.common.exceptions import WebDriverException

        try:
            self.driver.find_element(By.CSS_SELECTOR, '#btnNext').click()
        except WebDriverException as exc:
            self.log('Button could not be found: %s' % exc)
            return False

        def page_changed(driver):
            try:
                field = driver.find_element(By.CSS_SELECTOR, 'input#txtPage')
                return field.get_attribute('value') != cur_page
            except WebDriverException:
                # The element may be missing or stale while the page reloads.
                return False

        try:
            self.driver.wait.until(page_changed)
        except TimeoutException:
            self.log('Next page did not load in time')
            return False
        return True

    def parse_detail(self, response):
        """Save one issue page next to the listing page that linked to it.

        The file name is the part of the URL after the last ``?`` (the whole
        URL when it contains no ``?``).
        """
        page = response.meta['page']
        filename = 'downloads/' + self.name + '/page_' + page + '/' + response.url.split("?")[-1]
        self.write_file(filename, response.text)

    def closed(self, reason):
        """Quit Chrome and stop the virtual display when the spider closes."""
        try:
            driver = getattr(self, 'driver', None)
            if driver is not None:
                driver.quit()
        finally:
            display = getattr(self, 'display', None)
            if display is not None:
                display.stop()
        # Keep any close handling a parent class may define.
        parent_closed = getattr(super(ChongqingshiSpider, self), 'closed', None)
        if callable(parent_closed):
            parent_closed(reason)
\ No newline at end of file
...@@ -6,7 +6,7 @@ import re ...@@ -6,7 +6,7 @@ import re
from scrapy import Request from scrapy import Request
from scrapy import FormRequest from scrapy import FormRequest
from RespoDL.spiders import DownloadSpider from RespoDL.spiders.DownloadSpider import DownloadSpider
class HuaianshiSpider(DownloadSpider): class HuaianshiSpider(DownloadSpider):
name = 'huaianshi' name = 'huaianshi'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment