First commit

2015-04-22 18:24:28 +03:00
parent 3f579222f1
commit 0a76fce131
10 changed files with 10232 additions and 0 deletions

euler_parser/__init__.py (0 additions, empty file)

euler_parser/items.py (8 additions)

@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
import scrapy


class EulerItem(scrapy.Item):
    index = scrapy.Field()      # problem number, e.g. "1"
    name = scrapy.Field()       # problem title from the page's <h2>
    condition = scrapy.Field()  # problem statement as raw HTML

euler_parser/pipelines.py (6 additions)

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-


class EulerParserPipeline(object):
    def process_item(self, item, spider):
        # generated placeholder: passes every item through unchanged
        return item
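
The pipeline is the generated stub and is not enabled anywhere in this commit. For Scrapy to actually call it, it would have to be registered in settings.py; a minimal sketch, assuming the pipeline lives at euler_parser/pipelines.py as in the standard scaffolding (this setting is not part of the commit):

# hypothetical addition to euler_parser/settings.py, not in this commit
ITEM_PIPELINES = {
    'euler_parser.pipelines.EulerParserPipeline': 300,  # 0-1000; lower values run first
}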

euler_parser/settings.py (17 additions)

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# Scrapy settings for euler_parser project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'euler_parser'
SPIDER_MODULES = ['euler_parser.spiders']
NEWSPIDER_MODULE = 'euler_parser.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'euler_parser (+http://www.yourdomain.com)'
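
The USER_AGENT line is left commented out at its template default. A crawl that follows the "crawl responsibly" advice above would typically identify itself and throttle its requests; a hedged sketch of settings that are not part of this commit:

# hypothetical additions, not in this commit
USER_AGENT = 'euler_parser (+http://www.yourdomain.com)'  # identify the bot to the site
DOWNLOAD_DELAY = 1.0  # seconds to wait between requests to projecteuler.net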

euler_parser/spiders/__init__.py (4 additions)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

euler_parser/spiders/… (29 additions)

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule      # pre-1.0 import paths
from scrapy.contrib.linkextractors import LinkExtractor   # (scrapy.spiders / scrapy.linkextractors in later releases)
from euler_parser.items import EulerItem


class EulerSpider(CrawlSpider):
    name = "euler"
    allowed_domains = ["projecteuler.net"]
    start_urls = [
        "https://projecteuler.net/archives",
    ]
    rules = (
        # follow the paginated archive listings without scraping them
        Rule(LinkExtractor(allow=(r"/archives.+",)), follow=True),
        # scrape each problem page, and keep following links found on it
        Rule(LinkExtractor(allow=(r"/problem=.+",)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = EulerItem()
        # extract()[0] is the serialized "<title>Problem N - Project Euler</title>"
        # element, so the second whitespace-separated token is the problem number N
        item['index'] = response.xpath("/html/head/title").extract()[0].split()[1]
        item['name'] = response.xpath("//h2/text()").extract()[0]
        # concatenate the raw HTML of every child of the problem statement div
        cond = u""
        for i in response.xpath('//div[@role="problem"]/*').extract():
            cond += i
        item['condition'] = cond
        return item
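
With these files in place the crawl can be started from the project root with the standard Scrapy command line, e.g.:

scrapy crawl euler -o problems.json

which runs the "euler" spider and writes the scraped items to a JSON feed (the command itself is not part of the commit).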