Files
2015-04-22 18:24:28 +03:00

30 lines
907 B
Python

# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from euler_parser.items import EulerItem
class EulerSpider(CrawlSpider):
    """Crawl the Project Euler archives and emit one EulerItem per problem page.

    The crawl starts at the archives listing. Links matching ``/archives.+``
    are followed without a callback; pages whose URL matches ``/problem=.+``
    are handed to ``parse_item`` and their links are followed as well.
    """

    name = "euler"
    allowed_domains = ["projecteuler.net"]
    start_urls = [
        "https://projecteuler.net/archives",
    ]
    rules = (
        # Archive listing pages: followed only to discover more links.
        Rule(LinkExtractor(allow=("/archives.+",)), follow=True),
        # Problem pages: scraped by parse_item; their links are followed too.
        Rule(
            LinkExtractor(allow=("/problem=.+",)),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build an EulerItem from a single problem page.

        Fields populated:
          * ``index``     -- second whitespace-separated token of the page
                             title text.
          * ``name``      -- first text node found under any ``<h2>``.
          * ``condition`` -- concatenated markup of every child element of
                             ``<div role="problem">`` (empty string when the
                             div has no child elements).

        Raises IndexError when the page has no title text, the title has
        fewer than two tokens, or no ``<h2>`` text node exists.
        """
        item = EulerItem()

        # Select the title's text node rather than the serialized element so
        # the "<title>"/"</title>" tags cannot end up inside the tokens.
        # NOTE(review): assumes the title reads like "Problem <n> ..." so that
        # the second token is the problem number -- confirm against the site.
        title_text = response.xpath("/html/head/title/text()").extract()[0]
        item['index'] = title_text.split()[1]

        item['name'] = response.xpath("//h2/text()").extract()[0]

        # Join the serialized child elements in one pass instead of
        # accumulating them with += inside a loop.
        item['condition'] = u"".join(
            response.xpath('//div[@role="problem"]/*').extract()
        )
        return item