From: Martin Taibr Date: Thu, 23 Mar 2017 09:29:11 +0000 (+0100) Subject: script for checking links X-Git-Url: https://git.rm.cloudns.org/?a=commitdiff_plain;h=cdc6056a22e6e865cce6880007014237fa38805f;p=xonotic%2Fxonotic.wiki.git script for checking links --- diff --git a/assets/check-and-fix.py b/assets/check-and-fix.py new file mode 100755 index 0000000..ffd4f1d --- /dev/null +++ b/assets/check-and-fix.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 + +# Well, this wasn't supposed to be so long and complicated. +# Anyway, it makes sure the wiki works on both Gitlab and Github by moving +# stuff around and fixing links. Then it reports all remaining broken links +# and unused files. Since the wiki is in git, you can use `git status` +# and `git diff` to see the changes. You can also use the `--dry-run` flag +# to print all changes the script would make without actually making them. + +# See Editing.md for more information. + +# Some stuff that could have been done better: +# - Not parsing Markdown with regex. Currently, we for example report +# broken links even though they're inside code blocks (e.g. Irclog.md) +# - Using the type system (and mypy) to distinguish different link types +# to make sure the right functions are called with the right link types +# (e.g. page links, file links, links with headers, urls, ...) +# - Checking outbound links for 404s. + +import sys +import os +import glob +import regex # sudo pip3 install regex +import functools +from typing import * +from os.path import normpath, join, dirname, basename + + +# yeah, well, this is ugly but sure beats putting the regex on one line +def compile_regex(rgx: str): + # regex (unlike re) supports non-constant length look-behinds + return regex.compile( + "".join( + [line.strip() for line in rgx])) + + +# examples: +# [Page link](Some_Page) +# [Url link](http://example.com) +# ![Image](image_1.png) +# [![Image link to image](image_inner.png)](image_outer.png) +# [![Image link to page](image_inner.png)](Archive/Some_Page) + +# regex.sub doesnt support overlapping - we have to use lookbehinds. +# Practically, the inner link will never be a page so we don't need to +# sub it, but later we can reuse the regex to go through all the links +# and check that they're valid. +LINK_REGEX = compile_regex(""" +(?<= + \[ + (?: + [^\[\]]* + | + \!\[ + [^\[\]]* + \] + \( + [^()]* + \) + ) + \] +) +\( + ([^()]*) +\) +""") + + +dry_run = False + + +def strip_header_link(link: str) -> str: + "remove links to headers inside the file" + + header_index = link.rfind('#') + if header_index != -1: + link = link[:header_index] + return link + + +def convert_page_name(path: str) -> str: + "path can be with or without .md" + + if path.startswith("_"): + # ignore header, footer etc + return path + + if "-" in path: + # don't wanna break stuff like mapping-entity-func_door + return path + + headerless = strip_header_link(path) + # don't reformat these links because they're often linked to from outside + for exc in ["Repository_Access", "Halogenes_Newbie_Corner"]: + if headerless == exc or headerless == exc + ".md": + return path + + return basename(path).replace("_", "-") + + +def convert_page_link(link: str) -> str: + header_index = link.rfind('#') + if header_index != -1: + header = link[header_index + 1:] + if "_" in header: + print("warning: underscore in header: {}".format(link)) + return convert_page_name(link) + + +def find_paths() -> Tuple[List[str], List[str]]: + all_paths = sorted(filter( + os.path.isfile, + [name for name in glob.iglob('**', recursive=True)])) + md_paths = sorted(filter(lambda s: s.endswith(".md"), all_paths)) + return all_paths, md_paths + + +def fix_dir_structure(): + _, md_paths = find_paths() + for path in md_paths: + fixed = convert_page_name(path) + if fixed == path: + continue + + if os.path.exists(fixed): + print("warning: collision: {}".format(path)) + elif dry_run: + print("would rename {} to {}".format(path, fixed)) + else: + os.rename(path, fixed) + + +def is_between_files(link: str) -> bool: + if "://" in link or link.startswith("#"): + # http(s) link or link to header on the same page + return False + else: + return True + + +def is_page_link(link: str) -> bool: + # this is a best guess, i don't think there is a foolproof way to tell + + if link.startswith("assets") or link.startswith("img"): + # hopefully nobody adds more directories + return False + if "." in basename(link): + # hopefully it's an extension + return False + # files in root without extension will fail + + return True + + +def replace_link(changes: List[str], match) -> str: + text = match.group() + link_start = match.start(1) - match.start() + link_end = match.end(1) - match.start() + + link = text[link_start:link_end] + + if is_between_files(link) and is_page_link(link): + new_link = convert_page_link(link) + new_text = text[:link_start] + new_link + text[link_end:] + if text != new_text: + changes.append("\t{} -> {}".format(text, new_text)) + return new_text + else: + return text + + +def fix_links(): + _, md_paths = find_paths() + for path in md_paths: + with open(path, 'r+') as f: + contents = f.read() + + changes = [] + replacer = functools.partial(replace_link, changes) + contents_new = LINK_REGEX.sub(replacer, contents) + if dry_run and any(changes): + print("would convert these links in {}:".format(path)) + for change in changes: + print(change) + + if not dry_run and contents != contents_new: + f.seek(0) + f.write(contents_new) + f.truncate() + + +def link_to_path(current_file: str, link: str) -> str: + # nothing . .. / + # gitlab root current current root + # gollum current current current root + # github ok ok broken broken + + # when not using subdirs, nothing or "." works for all 3 + + if link.startswith("..") or link.startswith("/"): + print("file: {} bad link: {}", link) + + # path relative to wiki root, not curent file + current_dir = dirname(current_file) + link = normpath(join(current_dir, link)) + + link = strip_header_link(link) + + # page links don't have an extension - add it + extension_index = link.rfind('.') + if extension_index == -1: + link = link + '.md' + + return link + + +def get_file_links(path: str) -> Generator[str, None, None]: + with open(path, 'r') as f: + contents = f.read() + for match in LINK_REGEX.finditer(contents): + link = match.group(1) + + if is_between_files(link): + yield link + + +def canonicalize(path: str) -> str: + # spaces and capitalization don't seem to matter for pages + if path.endswith(".md"): + return path.replace(" ", "-").casefold() + else: + return path + + +def find_broken(all_paths: List[str], md_paths: List[str]): + canonical_paths = [canonicalize(path) for path in all_paths] + + for path in md_paths: + if path == "Irclog.md": + continue # TODO need to parse MD properly to avoid false posiives + for link in get_file_links(path): + link_target = canonicalize(link_to_path(path, link)) + if not link_target in canonical_paths: + #print("broken link in {}: {} -> {}".format(path, link, link_target)) + print("broken link in {}: {}".format(path, link)) + + +def walk_links(canonical_to_real: Dict[str, str], is_linked: Dict[str, bool], current_path: str): + canonical = canonicalize(current_path) + if canonical not in canonical_to_real: + # broken link - nothing to do here, we check broken links elsewhere + # because here we're not guaranteed to walk through all files + #print("not in known paths: {}".format(current_path)) + return + + current_path = canonical_to_real[canonical] + + if is_linked[current_path]: + return + + is_linked[current_path] = True + if current_path.endswith(".md"): + for link in get_file_links(current_path): + link_target = link_to_path(current_path, link) + walk_links(canonical_to_real, is_linked, link_target) + + +def find_unlinked(all_paths: List[str]): + canonical_to_real = {canonicalize(path): path for path in all_paths} + is_linked = {path: False for path in all_paths} + + # ignore these 2 - currently they don't show on GitLab but do on GitHub + is_linked["_Footer.md"] = True + is_linked["_Sidebar.md"] = True + + walk_links(canonical_to_real, is_linked, "Home.md") + + for path, linked in is_linked.items(): + if not linked: + print("not reachable from Home: {}".format(path)) + + +def check_links(): + all_paths, md_paths = find_paths() + find_broken(all_paths, md_paths) + find_unlinked(all_paths) + + +def main(): + global dry_run + if len(sys.argv) > 1 and sys.argv[1] == "--dry-run": + dry_run = True + + # convert file paths - put everything into root + fix_dir_structure() + + # convert links on all pages + fix_links() + + # look for broken links and unlinked files + check_links() + + +if __name__ == '__main__': + main() diff --git a/assets/convert-wiki-links.py b/assets/convert-wiki-links.py deleted file mode 100755 index 9479bae..0000000 --- a/assets/convert-wiki-links.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# not a simple sed script because I can do non-greedy easier here :x - -# changing gollum/wiki links to markdown/markup syntax (no redlinks support... fuuu gitlab) -# but not the other way around... -# since external links and images are better left using portable markdown syntax - -import os -import re - -FILES = ('.md',) -RX = [ -# (re.compile(r''), ''), - # I'm sure this could be cleaner... but it works (order is important (with \W for french chars), or [[a|b]] is matched to [a|b](a|b) !) - (re.compile(r'(?u)\[\[([\w\W \\/\.#\(\)_-]+?)\|([\w\W \\/\.#\(\)_-]+?)\]\]'), r'[\1](\2)'), # [[This|that#top]] -> [This](that#top) - (re.compile(r'(?u)\[\[([\w\W \\/\.#\(\)_-]+?)\]\]'), r'[\1](\1)'), # [[This]] -> [This](This) -] - -path = '.' -lsdir = os.listdir(path) -for f in lsdir: - file_name, file_extension = os.path.splitext(f) - new_f = file_name + file_extension + '.rx' - - if file_extension in FILES: - i = os.path.join(path, f) - o = os.path.join(path, new_f) - with open(i, "r") as inf, open(o, "w") as outf: - for line in inf: - for search, replace in RX: - #line = search.sub(replace, line) - line = re.sub(search, replace, line) - outf.write(line) - os.rename(o, i) diff --git a/assets/convert.sh b/assets/convert.sh deleted file mode 100755 index e7d4a55..0000000 --- a/assets/convert.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -# Textile to Markdown in-place conversion -for file in `ls *.md`; do - [ -f ${file} ] && [ ! -h ${file} ] || continue - pandoc -f textile -t markdown_github ${file} > ${file}.bak - mv ${file}.bak ${file} -done diff --git a/assets/redmine-to-gitlab/convert-wiki-links.py b/assets/redmine-to-gitlab/convert-wiki-links.py new file mode 100755 index 0000000..9479bae --- /dev/null +++ b/assets/redmine-to-gitlab/convert-wiki-links.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# not a simple sed script because I can do non-greedy easier here :x + +# changing gollum/wiki links to markdown/markup syntax (no redlinks support... fuuu gitlab) +# but not the other way around... +# since external links and images are better left using portable markdown syntax + +import os +import re + +FILES = ('.md',) +RX = [ +# (re.compile(r''), ''), + # I'm sure this could be cleaner... but it works (order is important (with \W for french chars), or [[a|b]] is matched to [a|b](a|b) !) + (re.compile(r'(?u)\[\[([\w\W \\/\.#\(\)_-]+?)\|([\w\W \\/\.#\(\)_-]+?)\]\]'), r'[\1](\2)'), # [[This|that#top]] -> [This](that#top) + (re.compile(r'(?u)\[\[([\w\W \\/\.#\(\)_-]+?)\]\]'), r'[\1](\1)'), # [[This]] -> [This](This) +] + +path = '.' +lsdir = os.listdir(path) +for f in lsdir: + file_name, file_extension = os.path.splitext(f) + new_f = file_name + file_extension + '.rx' + + if file_extension in FILES: + i = os.path.join(path, f) + o = os.path.join(path, new_f) + with open(i, "r") as inf, open(o, "w") as outf: + for line in inf: + for search, replace in RX: + #line = search.sub(replace, line) + line = re.sub(search, replace, line) + outf.write(line) + os.rename(o, i) diff --git a/assets/redmine-to-gitlab/convert.sh b/assets/redmine-to-gitlab/convert.sh new file mode 100755 index 0000000..e7d4a55 --- /dev/null +++ b/assets/redmine-to-gitlab/convert.sh @@ -0,0 +1,7 @@ +#!/bin/sh +# Textile to Markdown in-place conversion +for file in `ls *.md`; do + [ -f ${file} ] && [ ! -h ${file} ] || continue + pandoc -f textile -t markdown_github ${file} > ${file}.bak + mv ${file}.bak ${file} +done diff --git a/assets/redmine-to-gitlab/rename.sh b/assets/redmine-to-gitlab/rename.sh new file mode 100755 index 0000000..03bd61e --- /dev/null +++ b/assets/redmine-to-gitlab/rename.sh @@ -0,0 +1,2 @@ +#!/bin/sh +rename 's/\.textile$/$1.md/' *.textile diff --git a/assets/rename.sh b/assets/rename.sh deleted file mode 100755 index 03bd61e..0000000 --- a/assets/rename.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -rename 's/\.textile$/$1.md/' *.textile