First commit
This commit is contained in:
61
.gitignore
vendored
Normal file
61
.gitignore
vendored
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# Created by https://www.gitignore.io
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
env/
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*,cover
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
0
euler_parser/__init__.py
Normal file
0
euler_parser/__init__.py
Normal file
8
euler_parser/items.py
Normal file
8
euler_parser/items.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
class EulerItem(scrapy.Item):
    """Container for the data scraped from a single problem page.

    The three fields are filled in by the spider's ``parse_item`` callback.
    """

    # Token taken from the page <title> by the spider.
    index = scrapy.Field()

    # Text of the page's <h2> heading.
    name = scrapy.Field()

    # Concatenated markup of the children of the div with role="problem".
    condition = scrapy.Field()
|
||||||
6
euler_parser/pipelines.py
Normal file
6
euler_parser/pipelines.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
|
||||||
|
class EulerParserPipeline(object):
    """Item pipeline that forwards every item unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received.

        Args:
            item: The item handed to the pipeline.
            spider: The spider that produced the item (unused).

        Returns:
            The very same ``item`` object, unmodified.
        """
        # No filtering or transformation happens here.
        return item
|
||||||
17
euler_parser/settings.py
Normal file
17
euler_parser/settings.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Scrapy settings for euler_parser project
|
||||||
|
#
|
||||||
|
# For simplicity, this file contains only the most important settings by
|
||||||
|
# default. All the other settings are documented here:
|
||||||
|
#
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/settings.html
|
||||||
|
#
|
||||||
|
|
||||||
|
# Name of this Scrapy project's bot.
BOT_NAME = 'euler_parser'

# Spider package settings: both point at the project's ``spiders`` package.
SPIDER_MODULES = [
    'euler_parser.spiders',
]
NEWSPIDER_MODULE = 'euler_parser.spiders'
|
||||||
|
|
||||||
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||||
|
#USER_AGENT = 'euler_parser (+http://www.yourdomain.com)'
|
||||||
4
euler_parser/spiders/__init__.py
Normal file
4
euler_parser/spiders/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
||||||
29
euler_parser/spiders/euler_spider.py
Normal file
29
euler_parser/spiders/euler_spider.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||||
|
from scrapy.contrib.linkextractors import LinkExtractor
|
||||||
|
|
||||||
|
from euler_parser.items import EulerItem
|
||||||
|
|
||||||
|
class EulerSpider(CrawlSpider):
    """Crawl spider that starts at the Project Euler archive listing.

    Links matching ``/archives.+`` are followed without a callback; links
    matching ``/problem=.+`` are followed and passed to ``parse_item``.
    """

    name = "euler"
    allowed_domains = ["projecteuler.net"]
    start_urls = ["https://projecteuler.net/archives"]

    rules = (
        # Archive pages: follow only, nothing is extracted from them.
        Rule(LinkExtractor(allow="/archives.+"), follow=True),
        # Problem pages: extract an item and keep following links.
        Rule(
            LinkExtractor(allow=("/problem=.+", )),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build an ``EulerItem`` from a problem page response.

        Args:
            response: The response for a page matched by the problem rule.

        Returns:
            An ``EulerItem`` with ``index``, ``name`` and ``condition`` set.

        Raises:
            IndexError: If the title or ``<h2>`` query matches nothing, or
                the serialized title has fewer than two whitespace-separated
                tokens.
        """
        problem = EulerItem()

        # The whole <title> element is serialized (tags included); the
        # second whitespace-separated token is stored as the index.
        title_tokens = response.xpath("/html/head/title").extract()[0].split()
        problem['index'] = title_tokens[1]

        headings = response.xpath("//h2/text()").extract()
        problem['name'] = headings[0]

        # Glue together the markup of every direct child of the problem div.
        fragments = response.xpath('//div[@role="problem"]/*').extract()
        problem['condition'] = u"".join(fragments)

        return problem
|
||||||
5873
output/euler.csv
Normal file
5873
output/euler.csv
Normal file
File diff suppressed because it is too large
Load Diff
4223
output/euler.xml
Normal file
4223
output/euler.xml
Normal file
File diff suppressed because one or more lines are too long
11
scrapy.cfg
Normal file
11
scrapy.cfg
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = euler_parser.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = euler_parser
|
||||||
Reference in New Issue
Block a user