First commit

This commit is contained in:
2015-04-22 18:24:28 +03:00
parent 3f579222f1
commit 0a76fce131
10 changed files with 10232 additions and 0 deletions

61
.gitignore vendored Normal file
View File

@@ -0,0 +1,61 @@
# Created by https://www.gitignore.io
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/

0
euler_parser/__init__.py Normal file
View File

8
euler_parser/items.py Normal file
View File

@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
import scrapy
class EulerItem(scrapy.Item):
    """Container for one scraped Project Euler problem."""
    index = scrapy.Field()  # problem number, parsed from the page <title> by the spider
    name = scrapy.Field()  # problem title, taken from the page's <h2> text
    condition = scrapy.Field()  # problem statement markup, children of div[role="problem"]

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
class EulerParserPipeline(object):
    """Default pass-through item pipeline.

    Performs no filtering or transformation; every scraped item is
    forwarded unchanged to the next pipeline stage.
    """

    def process_item(self, item, spider):
        """Return *item* untouched."""
        return item

17
euler_parser/settings.py Normal file
View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# Scrapy settings for euler_parser project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Project identity; NOTE(review): Scrapy presumably also uses this in the
# default user agent — see the commented USER_AGENT override below.
BOT_NAME = 'euler_parser'

# Package(s) scanned for spider classes, and where new spiders are generated.
SPIDER_MODULES = ['euler_parser.spiders']
NEWSPIDER_MODULE = 'euler_parser.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'euler_parser (+http://www.yourdomain.com)'

View File

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from euler_parser.items import EulerItem
class EulerSpider(CrawlSpider):
    """Crawl the projecteuler.net archive and scrape every problem page."""

    name = "euler"
    allowed_domains = ["projecteuler.net"]
    start_urls = [
        "https://projecteuler.net/archives",
    ]
    rules = (
        # Follow archive pagination pages (not scraped themselves).
        # Fix: the original passed ("/archives.+") — a bare parenthesized
        # string, not a one-element tuple; add the trailing comma and use
        # raw strings for the regexes.
        Rule(LinkExtractor(allow=(r"/archives.+",)), follow=True),
        # Scrape each individual problem page.
        Rule(LinkExtractor(allow=(r"/problem=.+",)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build an EulerItem from one problem page.

        The problem number is the second whitespace-separated token of the
        serialized <title> element (e.g. "<title>Problem 1 - ...</title>").
        """
        item = EulerItem()
        item['index'] = response.xpath("/html/head/title").extract()[0].split()[1]
        item['name'] = response.xpath("//h2/text()").extract()[0]
        # Join once instead of quadratic `+=` string concatenation in a loop.
        item['condition'] = u"".join(
            response.xpath('//div[@role="problem"]/*').extract()
        )
        return item

5873
output/euler.csv Normal file

File diff suppressed because it is too large Load Diff

4223
output/euler.xml Normal file

File diff suppressed because one or more lines are too long

11
scrapy.cfg Normal file
View File

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
default = euler_parser.settings
[deploy]
#url = http://localhost:6800/
project = euler_parser