Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
R
RespoDL
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Wenchao Chen
RespoDL
Commits
3eb423b1
Commit
3eb423b1
authored
8 years ago
by
Wenchao Chen
Browse files
Options
Downloads
Patches
Plain Diff
Adding Downloader for ChongqingshiSpider
parent
54e58d4a
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
RespoDL/spiders/ChongqingshiSpider.py
+61
-0
61 additions, 0 deletions
RespoDL/spiders/ChongqingshiSpider.py
RespoDL/spiders/HuaianshiSpider.py
+1
-1
1 addition, 1 deletion
RespoDL/spiders/HuaianshiSpider.py
with
62 additions
and
1 deletion
RespoDL/spiders/ChongqingshiSpider.py
0 → 100644
+
61
−
0
View file @
3eb423b1
# Author: 陈闻超
# Version: 1.0
import
re
from
scrapy
import
Request
from
scrapy.http
import
TextResponse
from
RespoDL.spiders.DownloadSpider
import
DownloadSpider
from
selenium
import
webdriver
from
pyvirtualdisplay
import
Display
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
from
selenium.common.exceptions
import
TimeoutException
class ChongqingshiSpider(DownloadSpider):
    """Spider that saves the paged mail listing at ``start_urls`` and its detail pages.

    The listing is paged by clicking the ``#btnNext`` button, so it is read
    through a Selenium-driven Chrome running inside a virtual display. Each
    link found in the listing table is fetched with a regular Scrapy
    ``Request`` and stored by ``parse_detail``.

    ``write_file`` is not defined in this class; it is presumably inherited
    from ``DownloadSpider`` (confirm against that class).
    """

    name = 'chongqingshi'
    start_urls = ['http://www.cq.gov.cn/publicmail/citizen/ReleaseMailListDistrict.aspx']
    detail_url = 'http://www.cq.gov.cn/publicmail/citizen/'

    # Removes every non-digit character from the pager cell text.
    # Compiled once here instead of on every loop iteration.
    _NON_DIGITS = re.compile(r'\D')

    def __init__(self, *args, **kwargs):
        """Start the virtual display and the Chrome driver used by ``parse``."""
        super(ChongqingshiSpider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.download_delay = 0.25
        try:
            self.driver = webdriver.Chrome()
        except Exception:
            # Do not leave the virtual display running if Chrome cannot start.
            self.display.stop()
            raise
        self.driver.wait = WebDriverWait(self.driver, 100)

    def closed(self, reason):
        """Release the browser and the virtual display when the spider closes.

        Scrapy calls ``closed(reason)`` on the spider at shutdown. Any
        ``closed`` hook defined by a parent class is still invoked first.
        """
        parent_closed = getattr(super(ChongqingshiSpider, self), 'closed', None)
        try:
            if callable(parent_closed):
                parent_closed(reason)
        finally:
            try:
                self.driver.quit()
            finally:
                self.display.stop()

    def _snapshot(self, url):
        """Wrap the driver's current page source in a ``TextResponse`` for *url*."""
        return TextResponse(url, body=self.driver.page_source, encoding='utf-8')

    @staticmethod
    def _page_number(page):
        """Return the value of the ``input#txtPage`` field, or ``None`` if absent."""
        return page.css('input#txtPage::attr(value)').extract_first()

    def _wait_for_page_change(self, url, previous_page):
        """Block until the listing shows a page number other than *previous_page*.

        Returns ``True`` once the number changed, ``False`` if the driver's
        wait timed out first.
        """
        def page_changed(_driver):
            current = self._page_number(self._snapshot(url))
            return current is not None and current != previous_page

        try:
            self.driver.wait.until(page_changed)
        except TimeoutException:
            return False
        return True

    def parse(self, response):
        """Walk through every listing page, saving it and yielding detail requests.

        For each listing page the rendered HTML is written to
        ``downloads/<name>/page_<n>/<n>`` and one ``Request`` per link in
        ``table#dgrdMail`` is yielded, carrying the page number in ``meta``.
        Pagination stops at the last page, or as soon as the pager cannot be
        read, the next button cannot be clicked, or the next page never loads.
        """
        self.driver.get(response.url)
        while True:
            page = self._snapshot(response.url)
            cur_page = self._page_number(page)
            pager_cells = page.css("table.table_list_rq tr td::text").extract()
            if cur_page is None or len(pager_cells) < 3:
                self.log('Pager could not be read; stopping')
                break
            # The third text cell of the pager table carries the page count
            # mixed with other characters; keep only its digits.
            last_page = self._NON_DIGITS.sub("", pager_cells[2])

            filename = 'downloads/{0}/page_{1}/{1}'.format(self.name, cur_page)
            self.write_file(filename, page.text)

            for href in page.css("table#dgrdMail tr td a::attr(href)").extract():
                yield Request(
                    self.detail_url + href,
                    meta={'page': cur_page},
                    callback=self.parse_detail,
                )

            try:
                has_next = int(cur_page) < int(last_page)
            except ValueError:
                self.log('Page numbers are not numeric; stopping')
                break
            if not has_next:
                self.log('Reached last page')
                break

            try:
                self.driver.find_element(By.CSS_SELECTOR, '#btnNext').click()
            except Exception:
                # Best effort, as before, but stop: looping on would re-read
                # the same page forever.
                self.log('Button could not be found')
                break

            # Without this wait the loop could read the old page again before
            # the browser has finished loading the next one.
            if not self._wait_for_page_change(response.url, cur_page):
                self.log('Timed out waiting for the next page')
                break

    def parse_detail(self, response):
        """Save a detail page under the folder of the listing page it came from.

        The file is named after the part of the URL following the last ``?``.
        """
        page = response.request.meta['page']
        detail_id = response.url.split("?")[-1]
        filename = 'downloads/{0}/page_{1}/{2}'.format(self.name, page, detail_id)
        self.write_file(filename, response.text)
\ No newline at end of file
This diff is collapsed.
Click to expand it.
RespoDL/spiders/HuaianshiSpider.py
+
1
−
1
View file @
3eb423b1
...
@@ -6,7 +6,7 @@ import re
...
@@ -6,7 +6,7 @@ import re
from
scrapy
import
Request
from
scrapy
import
Request
from
scrapy
import
FormRequest
from
scrapy
import
FormRequest
from
RespoDL.spiders
import
DownloadSpider
from
RespoDL.spiders
.DownloadSpider
import
DownloadSpider
class
HuaianshiSpider
(
DownloadSpider
):
class
HuaianshiSpider
(
DownloadSpider
):
name
=
'
huaianshi
'
name
=
'
huaianshi
'
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment