123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- #!/usr/bin/env python
- from __future__ import print_function
- import os
- import sys
- import re
# Repository root: the parent of the directory that contains this script.
# __file__ is made absolute *before* walking up the tree; when it is a bare
# relative name (e.g. the script is launched from its own directory on an
# older Python), dirname() would otherwise return '' twice and the root would
# silently resolve to the current directory instead of its parent.
SOURCE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Directory whose Markdown files are scanned for broken relative links.
DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs')
def main():
    """Scan every Markdown file below DOCS_DIR for broken relative links.

    Changes the working directory to SOURCE_ROOT, gathers all ``.md`` files
    found while walking DOCS_DIR, runs getBrokenLinks() on each of them and
    prints a two-line summary.

    Returns:
        The total number of broken links reported by getBrokenLinks(), or 0
        when the directory walk is interrupted from the keyboard.
    """
    os.chdir(SOURCE_ROOT)

    markdown_files = []
    subdir_count = 0
    try:
        for parent, subdirs, names in os.walk(DOCS_DIR):
            subdir_count += len(subdirs)
            markdown_files.extend(
                os.path.join(parent, name)
                for name in names
                if name.endswith('.md')
            )
    except KeyboardInterrupt:
        print('Keyboard interruption. Please try again.')
        return 0

    # Files are checked in the order they were discovered.
    broken_total = sum(getBrokenLinks(path) for path in markdown_files)

    summary = ''.join([
        'Parsed through ', str(len(markdown_files)),
        ' files within docs directory and its ',
        str(subdir_count), ' subdirectories.',
    ])
    print(summary)
    print('Found ' + str(broken_total) + ' broken relative links.')
    return broken_total
def _readLines(path):
    """Return the lines of the text file at *path*.

    The file is always closed, including when reading fails. A keyboard
    interruption is announced and then re-raised so the caller never
    continues with missing data.
    """
    try:
        with open(path, 'r') as handle:
            return handle.readlines()
    except KeyboardInterrupt:
        print('Keyboard interruption while parsing. Please try again.')
        raise


def getBrokenLinks(filepath):
    """Report and count the broken relative links of one Markdown file.

    Both inline links (``[text](target)``) and reference definitions
    (``[label]: target``) are collected; targets starting with ``http`` are
    ignored. Each remaining target is then validated:

    * ``path``        - must be an existing file relative to the directory
      of *filepath*;
    * ``#anchor``     - must match a header of *filepath* itself;
    * ``path#anchor`` - the file must exist and contain a matching header.

    The broken targets are printed through print_errors().

    Args:
        filepath: Path of the Markdown file to inspect.

    Returns:
        The number of broken links found in the file.

    Raises:
        KeyboardInterrupt: Re-raised when reading a file is interrupted.
        IOError/OSError: When *filepath* cannot be opened.
    """
    currentDir = os.path.dirname(filepath)
    lines = _readLines(filepath)

    inlineLinkRegex = re.compile(r'\[(.*?)\]\((?P<link>(.*?))\)')
    referenceLinkRegex = re.compile(
        r'^\s{0,3}\[.*?\]:\s*(?P<link>[^<\s]+|<[^<>\r\n]+>)'
    )

    links = []
    for line in lines:
        # finditer, not search: one line may contain several inline links
        # and every one of them has to be checked.
        for inlineMatch in inlineLinkRegex.finditer(line):
            inlineLink = inlineMatch.group('link')
            if not inlineLink.startswith('http'):
                links.append(inlineLink)
        # The reference pattern is anchored at the start of the line, so at
        # most one definition can match.
        referenceMatch = referenceLinkRegex.search(line)
        if referenceMatch:
            referenceLink = referenceMatch.group('link').strip('<>')
            if not referenceLink.startswith('http'):
                links.append(referenceLink)

    brokenLinks = []
    for link in links:
        sections = link.split('#')
        if len(sections) < 2:
            # Plain file link without an anchor.
            if not os.path.isfile(os.path.join(currentDir, link)):
                brokenLinks.append(link)
        elif link.startswith('#'):
            # Anchor inside the file being inspected.
            if not checkSections(sections, lines):
                brokenLinks.append(link)
        else:
            # Anchor inside another file: the file must exist first.
            targetFile = os.path.join(currentDir, sections[0])
            if not os.path.isfile(targetFile):
                brokenLinks.append(link)
            elif not checkSections(sections, _readLines(targetFile)):
                brokenLinks.append(link)

    print_errors(filepath, brokenLinks)
    return len(brokenLinks)
def checkSections(sections, lines):
    """Tell whether a link anchor matches a header found in *lines*.

    Every line containing ``# `` followed by text is treated as a header.
    The header text is slugified and compared, lower-cased, with the anchor.

    Args:
        sections: A link split on ``#``; only ``sections[1]`` (the anchor)
            is read.
        lines: Lines of the Markdown file the link points into.

    Returns:
        True if the slug of some header equals the anchor, False otherwise.
    """
    anchor = sections[1]

    # Compiled once per call instead of once per header line.
    headerRegex = re.compile(r'# (?P<header>.*)')
    segmentRegex = re.compile(r'(`[^`]+`)|([^`]+)')
    invalidChars = re.compile(r'[^A-Za-z0-9_ \-]')
    invalidCharsOrUnderscore = re.compile(r'[^A-Za-z0-9_ \-]|_')

    for line in lines:
        headerMatch = headerRegex.search(line)
        if not headerMatch:
            continue
        # Slugify the header name:
        # * Trim surrounding whitespace so it does not turn into stray
        #   leading/trailing dashes
        # * Replace spaces with dashes
        # * Strip anything that's not alphanumeric or a dash
        # * Text quoted with backticks (`) keeps its underscores; they are
        #   stripped everywhere else
        header = headerMatch.group('header').strip().replace(' ', '-')
        slug = ''.join(
            invalidChars.sub('', quoted)
            + invalidCharsOrUnderscore.sub('', plain)
            for quoted, plain in segmentRegex.findall(header)
        )
        if slug.lower() == anchor:
            return True
    return False
def print_errors(filepath, brokenLink):
    """Print the broken links collected for one file.

    Nothing is printed when *brokenLink* is empty. Otherwise the file
    location is printed once, followed by one tab-indented line per link.

    Args:
        filepath: Path of the file the links were found in.
        brokenLink: Sequence of broken link targets (strings).
    """
    if not brokenLink:
        return
    print("File Location: " + filepath)
    for target in brokenLink:
        print("\tBroken links: " + target)
if __name__ == '__main__':
    # The exit status is the number of broken links, so any failure is
    # non-zero. POSIX keeps only the low 8 bits of an exit status, which
    # means a count of 256 would be reported as 0 (success); clamp it.
    sys.exit(min(main(), 255))
|