import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace
# during interpreter start-up, so the module is reloaded to get it back.
# The call then makes UTF-8 the process-wide codec for implicit
# str <-> unicode conversions (affects every module, not just this one).
reload(sys)
sys.setdefaultencoding("utf-8")
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from piccrawler.items import PiccrawlerItem
import re
import urllib
class DoubanSpider(Spider):
    """Search Douban for movie titles and save the poster of the first hit.

    Input files (paths are relative to the current working directory):

    * ``movie_name.txt`` -- one movie title per line.
    * ``movie_id.txt``   -- one id per line; the id on line *n* belongs to
      the title on line *n*.

    For every title the first search result is followed, its poster URL is
    stored in a ``PiccrawlerItem`` and the image is written to
    ``pictures/<movie_id>.jpg``.
    """

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = []

    def start_requests(self):
        """Yield one search request per non-blank line of ``movie_name.txt``.

        Each request carries the matching line of ``movie_id.txt`` in
        ``meta['movie_id']`` (an empty string when that file has no such
        line) so that ``parse_item`` knows which id the poster belongs to.

        Raises:
            IOError: if either input file cannot be opened.
        """
        url_head = "http://movie.douban.com/subject_search?search_text="

        with open('movie_name.txt', 'r') as name_file:
            movie_names = [line.strip() for line in name_file]
        with open('movie_id.txt', 'r') as id_file:
            movie_ids = [line.strip() for line in id_file]

        searches = []
        for index, movie_name in enumerate(movie_names):
            if not movie_name:
                # Blank lines would only produce an empty search; the index
                # still advances so later titles keep their line pairing.
                continue
            movie_id = movie_ids[index] if index < len(movie_ids) else ''
            # Percent-encode the title so characters such as '&', '#' or
            # spaces cannot break the query string.
            searches.append((url_head + urllib.quote(movie_name), movie_id))

        # Rebind on the instance instead of appending to the class-level
        # list, which would be shared (and keep growing) across instances.
        self.start_urls = [url for url, _ in searches]

        for url, movie_id in searches:
            # dont_filter=True matches what make_requests_from_url() did.
            yield Request(url, meta={'movie_id': movie_id}, dont_filter=True)

    def parse(self, response):
        """Follow the first result link of a search-results page.

        Yields a request for that link, handled by ``parse_item`` and
        carrying the same ``meta['movie_id']`` as the search request.
        Yields nothing when the XPath finds no link.
        """
        movie_link = response.xpath(
            '//*[@id="content"]/div/div[1]/div[2]/table[1]/tr/td[1]/a/@href'
        ).extract()
        if movie_link:
            yield Request(
                movie_link[0],
                callback=self.parse_item,
                meta={'movie_id': response.meta.get('movie_id', '')},
            )

    def parse_item(self, response):
        """Extract the poster URL from a movie page and download the image.

        Yields a single ``PiccrawlerItem`` with:

        * ``movie_picture`` -- the poster ``src`` value(s) joined and
          stripped (empty string when none is found);
        * ``movie_id`` -- the id passed in ``response.meta``; left unset
          when no id is available.

        The image is saved to ``pictures/<movie_id>.jpg`` only when both an
        id and a poster URL are present.
        """
        movie_picture = response.xpath(
            '//*[@id="mainpic"]/a/img/@src'
        ).extract()

        item = PiccrawlerItem()
        item['movie_picture'] = ''.join(movie_picture).strip()

        movie_id = response.meta.get('movie_id', '')
        if movie_id:
            item['movie_id'] = movie_id
            if movie_picture:
                # NOTE: urlretrieve is a blocking call made from inside a
                # Scrapy callback; kept as in the original design.
                urllib.urlretrieve(
                    movie_picture[0].strip(),
                    'pictures/' + movie_id + '.jpg',
                )
        else:
            self.logger.warning(
                "No movie id available for %s; poster not saved",
                response.url,
            )

        yield item