check-relative-doc-links.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. #!/usr/bin/env python
  2. from __future__ import print_function
  3. import os
  4. import sys
  5. import re
  6. SOURCE_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
  7. DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs')
  8. def main():
  9. os.chdir(SOURCE_ROOT)
  10. filepaths = []
  11. totalDirs = 0
  12. try:
  13. for root, dirs, files in os.walk(DOCS_DIR):
  14. totalDirs += len(dirs)
  15. for f in files:
  16. if f.endswith('.md'):
  17. filepaths.append(os.path.join(root, f))
  18. except KeyboardInterrupt:
  19. print('Keyboard interruption. Please try again.')
  20. return
  21. totalBrokenLinks = 0
  22. for path in filepaths:
  23. totalBrokenLinks += getBrokenLinks(path)
  24. print('Parsed through ' + str(len(filepaths)) +
  25. ' files within docs directory and its ' +
  26. str(totalDirs) + ' subdirectories.')
  27. print('Found ' + str(totalBrokenLinks) + ' broken relative links.')
  28. return totalBrokenLinks
  29. def getBrokenLinks(filepath):
  30. currentDir = os.path.dirname(filepath)
  31. brokenLinks = []
  32. try:
  33. f = open(filepath, 'r')
  34. lines = f.readlines()
  35. except KeyboardInterrupt:
  36. print('Keyboard interruption while parsing. Please try again.')
  37. finally:
  38. f.close()
  39. linkRegexLink = re.compile('\[(.*?)\]\((?P<link>(.*?))\)')
  40. referenceLinkRegex = re.compile(
  41. '^\s{0,3}\[.*?\]:\s*(?P<link>[^<\s]+|<[^<>\r\n]+>)'
  42. )
  43. links = []
  44. for line in lines:
  45. matchLinks = linkRegexLink.search(line)
  46. matchReferenceLinks = referenceLinkRegex.search(line)
  47. if matchLinks:
  48. relativeLink = matchLinks.group('link')
  49. if not str(relativeLink).startswith('http'):
  50. links.append(relativeLink)
  51. if matchReferenceLinks:
  52. referenceLink = matchReferenceLinks.group('link').strip('<>')
  53. if not str(referenceLink).startswith('http'):
  54. links.append(referenceLink)
  55. for link in links:
  56. sections = link.split('#')
  57. if len(sections) < 2:
  58. if not os.path.isfile(os.path.join(currentDir, link)):
  59. brokenLinks.append(link)
  60. elif str(link).startswith('#'):
  61. if not checkSections(sections, lines):
  62. brokenLinks.append(link)
  63. else:
  64. tempFile = os.path.join(currentDir, sections[0])
  65. if os.path.isfile(tempFile):
  66. try:
  67. newFile = open(tempFile, 'r')
  68. newLines = newFile.readlines()
  69. except KeyboardInterrupt:
  70. print('Keyboard interruption while parsing. Please try again.')
  71. finally:
  72. newFile.close()
  73. if not checkSections(sections, newLines):
  74. brokenLinks.append(link)
  75. else:
  76. brokenLinks.append(link)
  77. print_errors(filepath, brokenLinks)
  78. return len(brokenLinks)
  79. def checkSections(sections, lines):
  80. invalidCharsRegex = '[^A-Za-z0-9_ \-]'
  81. sectionHeader = sections[1]
  82. regexSectionTitle = re.compile('# (?P<header>.*)')
  83. for line in lines:
  84. matchHeader = regexSectionTitle.search(line)
  85. if matchHeader:
  86. # This does the following to slugify a header name:
  87. # * Replace whitespace with dashes
  88. # * Strip anything that's not alphanumeric or a dash
  89. # * Anything quoted with backticks (`) is an exception and will
  90. # not have underscores stripped
  91. matchHeader = str(matchHeader.group('header')).replace(' ', '-')
  92. matchHeader = ''.join(
  93. map(
  94. lambda match: re.sub(invalidCharsRegex, '', match[0])
  95. + re.sub(invalidCharsRegex + '|_', '', match[1]),
  96. re.findall('(`[^`]+`)|([^`]+)', matchHeader),
  97. )
  98. )
  99. if matchHeader.lower() == sectionHeader:
  100. return True
  101. return False
  102. def print_errors(filepath, brokenLink):
  103. if brokenLink:
  104. print("File Location: " + filepath)
  105. for link in brokenLink:
  106. print("\tBroken links: " + link)
  107. if __name__ == '__main__':
  108. sys.exit(main())