First commit

This commit is contained in:
2015-04-22 18:24:28 +03:00
parent 3f579222f1
commit 0a76fce131
10 changed files with 10232 additions and 0 deletions

61
.gitignore vendored Normal file
View File

@@ -0,0 +1,61 @@
# Created by https://www.gitignore.io
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/

0
euler_parser/__init__.py Normal file
View File

8
euler_parser/items.py Normal file
View File

@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
import scrapy
class EulerItem(scrapy.Item):
    """Container for one scraped Project Euler problem."""
    index = scrapy.Field()  # problem number, parsed from the page <title> by the spider
    name = scrapy.Field()  # problem title, taken from the page's <h2> text
    condition = scrapy.Field()  # problem statement markup, children of div[role="problem"]

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
class EulerParserPipeline(object):
    """Default pass-through item pipeline.

    Performs no filtering or transformation; every scraped item is
    forwarded unchanged to the next pipeline stage.
    """

    def process_item(self, item, spider):
        """Return *item* untouched."""
        return item

17
euler_parser/settings.py Normal file
View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# Scrapy settings for euler_parser project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Project identity; NOTE(review): Scrapy presumably also uses this in the
# default user agent — see the commented USER_AGENT override below.
BOT_NAME = 'euler_parser'

# Package(s) scanned for spider classes, and where new spiders are generated.
SPIDER_MODULES = ['euler_parser.spiders']
NEWSPIDER_MODULE = 'euler_parser.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'euler_parser (+http://www.yourdomain.com)'

View File

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from euler_parser.items import EulerItem
class EulerSpider(CrawlSpider):
    """Crawl the projecteuler.net archive and scrape every problem page."""

    name = "euler"
    allowed_domains = ["projecteuler.net"]
    start_urls = [
        "https://projecteuler.net/archives",
    ]
    rules = (
        # Follow archive pagination pages (not scraped themselves).
        # Fix: the original passed ("/archives.+") — a bare parenthesized
        # string, not a one-element tuple; add the trailing comma and use
        # raw strings for the regexes.
        Rule(LinkExtractor(allow=(r"/archives.+",)), follow=True),
        # Scrape each individual problem page.
        Rule(LinkExtractor(allow=(r"/problem=.+",)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build an EulerItem from one problem page.

        The problem number is the second whitespace-separated token of the
        serialized <title> element (e.g. "<title>Problem 1 - ...</title>").
        """
        item = EulerItem()
        item['index'] = response.xpath("/html/head/title").extract()[0].split()[1]
        item['name'] = response.xpath("//h2/text()").extract()[0]
        # Join once instead of quadratic `+=` string concatenation in a loop.
        item['condition'] = u"".join(
            response.xpath('//div[@role="problem"]/*').extract()
        )
        return item

5873
output/euler.csv Normal file

File diff suppressed because it is too large Load Diff

4223
output/euler.xml Normal file

File diff suppressed because one or more lines are too long

11
scrapy.cfg Normal file
View File

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
default = euler_parser.settings
[deploy]
#url = http://localhost:6800/
project = euler_parser