--- /dev/null
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+#
+# Copyright: © 2014 "nyov"
+# License: Expat
+#
+# This script crawls a Redmine wiki website and writes the full history
+# of every page it finds to a single branch inside a Git repository.
+#
+# The script creates a git repository in your working directory.
+# It requires the scrapy (0.24) and pygit2 python packages.
+# It also needs enough memory to hold all scraped revisions until the
+# end of the crawl, when they are sorted by date and version and the
+# git history is written to disk in the correct order.
+#
+# It was written to import from a static html dump of a Redmine wiki
+# (hence some workarounds in how the crawl runs, e.g. for missing pages),
+# but it should work on, or be easily adaptable to, a live installation.
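+#
+# Usage (assuming the script is saved as, say, redmine2git.py):
+#   $ python2 redmine2git.py
+# An optional git-svn style "redmine-authors.txt" file in the working
+# directory maps Redmine user handles to full author identities.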
+
+import scrapy
+from scrapy import log
+from scrapy.contrib.linkextractors import LinkExtractor
+from scrapy.http import Request, HtmlResponse
+from scrapy.selector import Selector
+
+import urlparse
+import urllib
+import re
+
+import datetime
+#from dateutil.parser import parse
+
+# for git imports
+import pygit2
+import heapq
+import calendar
+import time
+
+################
+### SETTINGS ###
+################
+
+BOT_NAME = 'RedmineExporter'
+BOT_VERSION = '1.0'
+# how this bot identifies itself to the target website
+USER_AGENT = '%s/%s (+http://www.yourdomain.com)' % (BOT_NAME, BOT_VERSION)
+# how many parallel connections to keep open to the target website
+CONCURRENT_REQUESTS = 16
+
+# show duplicate (dropped) requests
+DUPEFILTER_DEBUG = False
+# for debugging log level see end of file
+################
+
+def read_git_authors(file):
+ """Read a git (git-svn) authors.txt file
+
+ which has the line format:
+ handle = Full Name <and@some.email>
+ """
+ authors = {}
+ try:
+ with open(file) as f:
+ data = f.readlines()
+ data = (l for l in data if not l.startswith('#'))
+        for line in data:
+ name, handle = line.strip().split(' = ')
+ author, email = handle.rstrip('>').split(' <')
+ authors[name] = (author, email)
+ #print('\t%s => "%s" [%s]' % (name, author, email))
+ except IOError: pass
+ return authors
+
+
+class RedmineUser(scrapy.Item):
+ author = scrapy.Field()
+ email = scrapy.Field()
+
+
+class RedminePage(scrapy.Item):
+ pagename = scrapy.Field()
+ version = scrapy.Field()
+ lastversion = scrapy.Field()
+ updated = scrapy.Field()
+ user = scrapy.Field()
+ comment = scrapy.Field()
+ content = scrapy.Field()
+ # debug
+ url = scrapy.Field()
+
+
+class RedmineExportSpider(scrapy.Spider):
+ """Xonotic Redmine exporter"""
+
+ name = BOT_NAME
+ allowed_domains = ['dev.xonotic.org']
+ start_urls = (
+ # wiki's 'Index by title' page
+ 'http://dev.xonotic.org/projects/xonotic/wiki/index.html',
+        # this page does not appear in the index overview because it has
+        # been renamed, so request it explicitly
+ 'http://dev.xonotic.org/projects/xonotic/wiki/IRC.html',
+ )
+
+ def start_requests(self):
+ for link in self.start_urls[:1]: # index
+ yield Request(url=link, callback=self.parse_index)
+ for link in self.start_urls[1:]: # any other links
+ yield Request(url=link, callback=self.parse_pages)
+
+ def parse_index(self, response):
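+        """Extract the links to all wiki pages from the 'Index by title' page."""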
+ l = LinkExtractor(allow=(r'/wiki/.*\.html'), restrict_xpaths=('//div[@id="wrapper"]//div[@id="content"]'))
+ for link in l.extract_links(response):
+ yield Request(link.url, callback=self.parse_pages)
+
+ def parse_pages(self, response):
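+        """Follow a wiki page to its history listing (via the history icon link)."""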
+ url, = response.xpath('//div[@id="wrapper"]//div[@id="content"]//a[contains(@class, "icon-history")]/@href').extract()[:1] or [None]
+ return Request(urlparse.urljoin(response.url, url), callback=self.parse_history_entry)
+
+ def parse_history_entry(self, response):
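+        """Check the history listing for pagination before parsing it."""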
+ page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
+ paginated, = page.xpath('.//span[@class="pagination"]/a[contains(text(), "Next")]/@href').extract()[:1] or [None]
+ if paginated:
+            # re-entry, missing pages workaround: jump to the last pagination link
+            full, = page.xpath('.//span[@class="pagination"]/a[last()]/@href').extract()
+            # note: no recursion over further history pages yet (200+ revisions)
+            return Request(urlparse.urljoin(response.url, full), callback=self.parse_history)
+ else:
+ return self.parse_history(response)
+
+ def parse_history(self, response):
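+        """Scrape one item per revision in the history table,
+        requesting each revision's raw content and its author's profile."""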
+ page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
+ history = page.xpath('.//form//table/tbody/tr')
+ pagename = re.match(r'.*/wiki/(.*)/history', response.url).group(1)
+ lastversion = page.xpath('.//form//table/tbody/tr[1]/td[1]/a/text()').extract()[0]
+ for row in history:
+ i = RedminePage()
+ i['pagename'] = pagename
+ i['version'], = row.xpath('td[@class="id"]/a/text()').extract()[:1] or [None]
+ i['version'] = int(i['version'])
+ i['lastversion'] = int(lastversion)
+ date, = row.xpath('td[@class="updated_on"]/text()').extract()
+ # date parse, assume UTC
+ #i['updated'] = parse(date)
+ i['updated'] = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M %p")
+ i['user'], = row.xpath('td[@class="author"]/a[contains(@class, "user")]/text()').extract()[:1] or [None]
+ userpage, = row.xpath('td[@class="author"]/a[contains(@class, "user")]/@href').extract()[:1] or [None]
+ if userpage is not None:
+ yield Request(urlparse.urljoin(response.url, userpage), callback=self.parse_user)
+ i['comment'], = row.xpath('td[@class="comments"]/text()').extract()[:1] or [None]
+ content, = row.xpath('td[@class="buttons"]//a[contains(@href, "annotate.html")]/@href').extract()[:1] or [None]
+ request = Request(urlparse.urljoin(response.url, content), callback=self.parse_page)
+ request.meta['item'] = i
+ yield request
+
+ def parse_user(self, response):
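+        """Scrape a user's name and (obfuscated) email address from their profile page."""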
+ i = RedmineUser()
+ user = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
+ i['author'], = user.xpath('h2/text()').extract()[:1] or [None]
+ i['author'] = i['author'].strip()
+ #i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/a[contains(@href, "mailto")]/text()').extract()[:1] or [None]
+ i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/script/text()').re(r'.*\'(.*)\'')[:1] or [None]
+ if not i['email']:
+ i['email'] = '%s@' % i['author']
+ else:
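+            # the address is hidden in a javascript document.write('<a href="mailto:...">...</a>')
+            # snippet; unquote and unescape it, then pull the address out of the generated link text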
+ email = urllib.unquote(i['email']).lstrip('document.write(\'').rstrip('\');').decode('string_escape').replace('\\/', '/')
+ fake = Selector(HtmlResponse(response.url, encoding='utf-8', body=email))
+ i['email'], = fake.xpath('//a/text()').extract()[:1] or [None]
+ return i
+
+ def parse_page(self, response):
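+        """Assemble the revision's raw content, line by line, from the annotate view."""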
+ i = response.meta['item']
+ page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
+ lines = page.xpath('table[contains(@class, "filecontent")]//tr/td[@class="line-code"]') # keep empty lines!
+ i['url'] = response.url
+ i['content'] = ''
+ for line in lines:
+ line = (line.xpath('pre/text()').extract() or [u''])[0]
+ i['content'] += line + '\n'
+
+ return i
+
+
+
+class GitImportPipeline(object):
+ """Git dumper"""
+
+ def __init__(self, *a, **kw):
+ self.repo = pygit2.init_repository('wiki.git', False) # non-bare repo
+ self.heap = [] # heap for sorting commits
+ self.committer = pygit2.Signature('RedmineExport', 'redmineexport@dev.xonotic.org', encoding='utf-8')
+ self.users = {}
+
+ def open_spider(self, spider):
+ self.users = read_git_authors("redmine-authors.txt")
+
+ def close_spider(self, spider):
+ self.write_git(spider)
+
+ def process_item(self, i, spider):
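+        """Remember scraped users; queue page revisions on the heap, keyed by timestamp and version."""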
+ if isinstance(i, RedmineUser):
+ # prefer pre-loaded identities from local file
+ if i['author'] not in self.users:
+ self.users[i['author']] = (i['author'], i['email'])
+ log.msg("Scraped user %s" % (i['author'],), spider=spider, level=log.INFO)
+
+ if isinstance(i, RedminePage):
+ oid = self.repo.create_blob(i['content'].encode("utf8"))
+ ts = calendar.timegm(i['updated'].utctimetuple()) # datetime to unix timestamp for sorting
+ heapq.heappush(self.heap, (ts, i['version'], oid, i))
+ log.msg('Scraped page "%s" @ %s' % (i['pagename'], i['version']), spider=spider, level=log.INFO)
+
+ return i
+
+ def write_git(self, spider):
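+        """Pop all queued revisions in chronological order and write each
+        one as a commit on refs/heads/master."""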
+ parent = parent_id = None
+ for _ in range(len(self.heap)):
+ (ts, vsn, oid, i) = heapq.heappop(self.heap)
+
+ commit_comment = i['comment'] or u''
+ add_comment = u'\n\n(Commit created by redmine exporter script from page "%s" version %s)' % (i['pagename'], i['version'])
+
+ if parent:
+ tb = self.repo.TreeBuilder(parent.tree) # treeish ~= filesystem folder
+ else:
+ tb = self.repo.TreeBuilder()
+
+            filename = '%s.textile' % i['pagename']
+
+ tb.insert(filename, oid, pygit2.GIT_FILEMODE_BLOB)
+ tree = tb.write() # create updated treeish with current page blob added
+
+ parents = []
+ if parent is not None:
+ parents = [parent_id]
+
+ (user, email) = self.users[i['user']]
+ author = pygit2.Signature(user, email, time=ts, offset=0, encoding='utf-8')
+
+ log.msg("Committing %s @ %s (%s)" % (i['pagename'], i['version'], oid), spider=spider, level=log.INFO)
+ cid = self.repo.create_commit(
+ 'refs/heads/master',
+ author, self.committer, commit_comment + add_comment, tree, parents, 'utf-8'
+ )
+ # commit is new parent for next commit
+ parent = self.repo.get(cid)
+ parent_id = cid
+
+
+ITEM_PIPELINES = { # HAXX :D
+ GitImportPipeline: 800,
+}
+
+# haxx: sad monkeypatch, might break with other scrapy versions.
+# Scrapy's load_object expects a dotted-path string; this version also
+# accepts the class object used in ITEM_PIPELINES above.
+from importlib import import_module
+def load_object(path):
+ try:
+ dot = path.rindex('.')
+ except ValueError:
+ raise ValueError("Error loading object '%s': not a full path" % path)
+ except AttributeError:
+ return path # hax
+
+ module, name = path[:dot], path[dot+1:]
+ mod = import_module(module)
+
+ try:
+ obj = getattr(mod, name)
+ except AttributeError:
+ raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
+
+ return obj
+
+scrapy.utils.misc.load_object = load_object
+# end haxx
+
+from scrapy.exceptions import DontCloseSpider
+def finished_run():
+ log.msg("""
+┌───────────────────────────────────────┐
+│ finished run │
+│ │
+│ VERIFY IT REALLY FOUND ALL YOUR PAGES │
+│ OR YOU WILL BE SORRY LATER │
+│ │
+│ if it was successful, you now want to │
+│ repack the dumped git object database:│
+│ │
+│ $ git reflog expire --expire=now --all│
+│ $ git gc --prune=now │
+│ $ git repack -A -d │
+│ $ git gc --aggressive --prune=now │
+└───────────────────────────────────────┘
+ """, spider=spider, level=log.INFO)
+
+
+if __name__ == "__main__":
+ # for scrapy 0.24
+ from twisted.internet import reactor
+ from scrapy.utils.project import get_project_settings
+ from scrapy.crawler import Crawler
+ from scrapy import log, signals
+
+ import sys
+
+ print("""
+ ┌───────────────────────────────────────┐
+ │ Redmine Exporter script │
+ ├───────────────────────────────────────┤
+ │ handle with care, │
+ │ don't kill your webserver, │
+ │ ...enjoy │
+ └───────────────────────────────────────┘
+ """)
+ raw_input("Hit Enter to continue...")
+
+ spider = RedmineExportSpider()
+ settings = get_project_settings()
+ settings.set('BOT_NAME', BOT_NAME, priority='cmdline')
+ settings.set('USER_AGENT', USER_AGENT, priority='cmdline')
+ settings.set('ITEM_PIPELINES', ITEM_PIPELINES, priority='cmdline')
+ settings.set('CONCURRENT_REQUESTS', CONCURRENT_REQUESTS, priority='cmdline')
+ settings.set('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG, priority='cmdline')
+ crawler = Crawler(settings)
+ crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+ crawler.signals.connect(finished_run, signal=signals.spider_closed)
+ crawler.configure()
+ crawler.crawl(spider)
+ crawler.start()
+# log.start(loglevel=log.DEBUG)
+ log.start(loglevel=log.INFO)
+ log.msg("Starting run ...", spider=spider, level=log.INFO)
+ reactor.run()