You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

325 line
9.3KB

  1. //
  2. // MMHTMLParser.m
  3. // MMMarkdown
  4. //
  5. // Copyright (c) 2012 Matt Diephouse.
  6. //
  7. // Permission is hereby granted, free of charge, to any person obtaining a copy
  8. // of this software and associated documentation files (the "Software"), to deal
  9. // in the Software without restriction, including without limitation the rights
  10. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. // copies of the Software, and to permit persons to whom the Software is
  12. // furnished to do so, subject to the following conditions:
  13. //
  14. // The above copyright notice and this permission notice shall be included in
  15. // all copies or substantial portions of the Software.
  16. //
  17. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23. // THE SOFTWARE.
  24. //
  25. #import "MMHTMLParser.h"
  26. #import "MMElement.h"
  27. #import "MMScanner.h"
  28. @implementation MMHTMLParser
  29. #pragma mark - Public Methods
  // Parses a block-level HTML element at the scanner's current position.
  //
  // The strict parser runs first, wrapped in a scanner transaction that is
  // committed with YES only when it produced an element (committing with NO
  // presumably rewinds the scanner -- confirm against MMScanner). When the
  // strict parser finds nothing, the lenient parser is given a turn.
  //
  // scanner - The scanner positioned where a block tag may begin.
  //
  // Returns the element from the strict or the lenient parser, or nil when
  // neither of them matched.
  - (MMElement *)parseBlockTagWithScanner:(MMScanner *)scanner
  {
      [scanner beginTransaction];
      MMElement *strictMatch = [self _parseStrictBlockTagWithScanner:scanner];
      BOOL didMatchStrictly = (strictMatch != nil);
      [scanner commitTransaction:didMatchStrictly];

      return didMatchStrictly ? strictMatch : [self _parseLenientBlockTagWithScanner:scanner];
  }
  // Parses an HTML comment ("<!-- ... -->"), which may span several lines.
  //
  // scanner - The scanner positioned where the comment may begin.
  //
  // Returns an MMElementTypeHTML element whose range runs from the scanner's
  // startLocation to just past the closing "-->", or nil when the input does
  // not start with "<!--" or the end of the string is reached before a
  // closing "-->" is found.
  - (MMElement *)parseCommentWithScanner:(MMScanner *)scanner
  {
      if (![scanner matchString:@"<!--"])
      {
          return nil;
      }

      // Only a '-' can start the terminator, so everything else is skipped in bulk.
      NSCharacterSet *dash    = [NSCharacterSet characterSetWithCharactersInString:@"-"];
      NSCharacterSet *nonDash = [dash invertedSet];

      while (!scanner.atEndOfString)
      {
          if (scanner.atEndOfLine)
          {
              [scanner advanceToNextLine];
              continue;
          }

          [scanner skipCharactersFromSet:nonDash];
          if (![scanner matchString:@"-->"])
          {
              // Not the terminator: step forward one character and keep looking.
              [scanner advance];
              continue;
          }

          NSUInteger commentStart = scanner.startLocation;
          MMElement *comment = [MMElement new];
          comment.type  = MMElementTypeHTML;
          comment.range = NSMakeRange(commentStart, scanner.location - commentStart);
          return comment;
      }

      return nil;
  }
  // Parses a single HTML tag: an opening tag, a closing tag ("</name>") or a
  // self-closing tag ("<name />"), optionally carrying attributes.
  //
  // scanner - The scanner positioned where the tag may begin.
  //
  // Returns an MMElementTypeHTML element covering the text from the
  // scanner's startLocation to just past the '>' and whose stringValue is the
  // tag name, or nil when the input is not a well-formed tag.
  - (MMElement *)parseInlineTagWithScanner:(MMScanner *)scanner
  {
      if (scanner.nextCharacter != '<')
      {
          return nil;
      }
      [scanner advance];

      // A '/' right after the '<' marks a closing tag; it is simply consumed.
      if (scanner.nextCharacter == '/')
      {
          [scanner advance];
      }

      NSRange nameRange = [self _parseNameWithScanner:scanner];
      if (nameRange.length == 0)
      {
          return nil;
      }

      [self _parseAttributesWithScanner:scanner];
      [scanner skipWhitespace];

      // A '/' right before the '>' marks a self-closing tag.
      if (scanner.nextCharacter == '/')
      {
          [scanner advance];
      }

      if (scanner.nextCharacter != '>')
      {
          return nil;
      }
      [scanner advance];

      NSUInteger tagStart = scanner.startLocation;
      NSString  *tagName  = [scanner.string substringWithRange:nameRange];

      MMElement *tag = [MMElement new];
      tag.type        = MMElementTypeHTML;
      tag.range       = NSMakeRange(tagStart, scanner.location - tagStart);
      tag.stringValue = tagName;
      return tag;
  }
  86. #pragma mark - Private Methods
  // Parses a complete, well-formed block-level HTML element: an opening tag
  // whose name is one of the known block tags, its content, and the matching
  // end tag. Between the two tags the only '<' constructs accepted are nested
  // strict block elements, comments and individual tags; any other '<' makes
  // the parse fail.
  //
  // scanner - The scanner positioned where the element may begin.
  //
  // Returns an MMElementTypeHTML element covering the text from the
  // scanner's startLocation to just past the end tag, or nil when the input
  // is not such an element (including when the string ends before the end
  // tag is found).
  - (MMElement *)_parseStrictBlockTagWithScanner:(MMScanner *)scanner
  {
      // The tag list never changes, so it is built once instead of on every
      // call. This method runs recursively for nested elements and is retried
      // at each '<' inside a block, so the per-call rebuild added up.
      static NSSet *htmlBlockTags;
      static dispatch_once_t onceToken;
      dispatch_once(&onceToken, ^{
          htmlBlockTags = [[NSSet alloc] initWithObjects:
                           @"p", @"div", @"h1", @"h2", @"h3", @"h4", @"h5", @"h6",
                           @"blockquote", @"pre", @"table", @"dl", @"ol", @"ul",
                           @"script", @"noscript", @"form", @"fieldset", @"iframe",
                           @"math", @"ins", @"del", nil];
      });

      // The element has to open with a '<'.
      if (scanner.nextCharacter != '<')
          return nil;
      [scanner advance];

      // The tag name must be one of the block-level names (exact match).
      NSString *tagName = [scanner nextWord];
      if (![htmlBlockTags containsObject:tagName])
          return nil;
      scanner.location += tagName.length;

      [self _parseAttributesWithScanner:scanner];
      [scanner skipWhitespace];

      if (scanner.nextCharacter != '>')
          return nil;
      [scanner advance];

      // Everything except '<' is plain content that can be skipped in bulk.
      NSCharacterSet *boringChars = [[NSCharacterSet characterSetWithCharactersInString:@"<"] invertedSet];
      while (1)
      {
          // Running out of input before the end tag means this is not a
          // well-formed element.
          if (scanner.atEndOfString)
              return nil;

          [scanner skipCharactersFromSet:boringChars];
          if (scanner.atEndOfLine)
          {
              [scanner advanceToNextLine];
              continue;
          }

          // The scanner is on a '<'. Try, in order: our own end tag, a nested
          // block element, a comment, any other tag. Each attempt runs in its
          // own transaction that is committed with NO on failure.
          [scanner beginTransaction];
          if ([self _parseEndTag:tagName withScanner:scanner])
          {
              [scanner commitTransaction:YES];
              break;
          }
          [scanner commitTransaction:NO];

          MMElement *element;

          [scanner beginTransaction];
          element = [self _parseStrictBlockTagWithScanner:scanner];
          [scanner commitTransaction:element != nil];
          if (element)
              continue;

          [scanner beginTransaction];
          element = [self parseCommentWithScanner:scanner];
          [scanner commitTransaction:element != nil];
          if (element)
              continue;

          [scanner beginTransaction];
          element = [self parseInlineTagWithScanner:scanner];
          [scanner commitTransaction:element != nil];
          if (element)
              continue;

          // A '<' that starts none of the above: give up on the strict parse.
          return nil;
      }

      MMElement *element = [MMElement new];
      element.type  = MMElementTypeHTML;
      element.range = NSMakeRange(scanner.startLocation, scanner.location-scanner.startLocation);

      return element;
  }
  // Consumes an end tag for the given tag name: "</", optional whitespace,
  // the name, optional whitespace, then ">".
  //
  // tagName - The name the end tag has to carry (compared with matchString:).
  // scanner - The scanner positioned where the end tag may begin.
  //
  // Returns YES when the whole end tag was consumed. Returns NO at the first
  // piece that does not match; the scanner is not rewound here, so callers
  // that need that wrap the call in a transaction.
  - (BOOL)_parseEndTag:(NSString *)tagName withScanner:(MMScanner *)scanner
  {
      if (scanner.nextCharacter != '<')
      {
          return NO;
      }
      [scanner advance];

      if (scanner.nextCharacter != '/')
      {
          return NO;
      }
      [scanner advance];

      [scanner skipWhitespace];
      BOOL nameMatches = [scanner matchString:tagName];
      if (!nameMatches)
      {
          return NO;
      }

      [scanner skipWhitespace];
      BOOL isClosed = (scanner.nextCharacter == '>');
      if (isClosed)
      {
          [scanner advance];
      }
      return isClosed;
  }
  // Fallback block parser for HTML that the strict parser rejected. It only
  // requires a '<' followed by a known block tag name and, somewhere after
  // it, a '>'. From there it consumes whole lines until the scanner lands on
  // a line where it is immediately at the end of the line (a blank line;
  // what happens on the very last line depends on MMScanner's
  // advanceToNextLine -- confirm there).
  //
  // scanner - The scanner positioned where the element may begin.
  //
  // Returns an MMElementTypeHTML element covering the text from the
  // scanner's startLocation to where the scanner stopped, or nil when the
  // input does not start with a block tag or no '>' is found before the end
  // of the string.
  - (MMElement *)_parseLenientBlockTagWithScanner:(MMScanner *)scanner
  {
      // The tag list never changes, so it is built once instead of on every
      // call.
      static NSSet *htmlBlockTags;
      static dispatch_once_t onceToken;
      dispatch_once(&onceToken, ^{
          htmlBlockTags = [[NSSet alloc] initWithObjects:
                           @"p", @"div", @"h1", @"h2", @"h3", @"h4", @"h5", @"h6",
                           @"blockquote", @"pre", @"table", @"dl", @"ol", @"ul",
                           @"script", @"noscript", @"form", @"fieldset", @"iframe",
                           @"math", @"ins", @"del", nil];
      });

      // The element has to open with a '<'.
      if (scanner.nextCharacter != '<')
          return nil;
      [scanner advance];

      // The tag name must be one of the block-level names (exact match).
      NSString *tagName = scanner.nextWord;
      if (![htmlBlockTags containsObject:tagName])
          return nil;
      scanner.location += tagName.length;

      // Find a '>', crossing line boundaries if necessary.
      while (scanner.nextCharacter != '>')
      {
          if (scanner.atEndOfString)
              return nil;
          else if (scanner.atEndOfLine)
              [scanner advanceToNextLine];
          else
              [scanner advance];
      }

      // Skip lines until we come across a blank line.
      while (!scanner.atEndOfLine)
      {
          [scanner advanceToNextLine];
      }

      MMElement *element = [MMElement new];
      element.type  = MMElementTypeHTML;
      element.range = NSMakeRange(scanner.startLocation, scanner.location-scanner.startLocation);

      return element;
  }
  // Consumes a tag or attribute name: a run of alphanumeric characters,
  // ':' and '-'.
  //
  // scanner - The scanner positioned where the name may begin.
  //
  // Returns the range of the name inside the scanner's string. The length is
  // 0 when no name character was found at the current position.
  - (NSRange)_parseNameWithScanner:(MMScanner *)scanner
  {
      NSMutableCharacterSet *nameCharacters = [NSMutableCharacterSet alphanumericCharacterSet];
      [nameCharacters addCharactersInString:@":-"];

      // Remember where the name starts before the scanner moves past it.
      NSUInteger nameStart  = scanner.location;
      NSUInteger nameLength = [scanner skipCharactersFromSet:nameCharacters];

      return NSMakeRange(nameStart, nameLength);
  }
  // Consumes a quoted string delimited by a pair of '"' or a pair of '\''.
  // The string may continue over several lines.
  //
  // scanner - The scanner positioned where the opening quote may be.
  //
  // Returns YES when the string, including its closing quote, was consumed.
  // Returns NO when the next character is not a quote, or when the end of
  // the input is reached before the matching closing quote.
  - (BOOL)_parseStringWithScanner:(MMScanner *)scanner
  {
      unichar quote = scanner.nextCharacter;
      BOOL startsWithQuote = (quote == '"' || quote == '\'');
      if (!startsWithQuote)
      {
          return NO;
      }
      [scanner advance];

      // Walk forward until the same quote character shows up again.
      for (;;)
      {
          if (scanner.nextCharacter == quote)
          {
              break;
          }

          if (scanner.atEndOfString)
          {
              return NO;
          }

          if (scanner.atEndOfLine)
          {
              [scanner advanceToNextLine];
          }
          else
          {
              [scanner advance];
          }
      }

      // Step over the closing quotation mark.
      [scanner advance];
      return YES;
  }
  // Consumes an unquoted attribute value: a run of characters that are
  // neither whitespace nor one of  " ' = > < `
  //
  // scanner - The scanner positioned where the value may begin.
  //
  // Returns YES when at least one character was consumed.
  - (BOOL)_parseAttributeValueWithScanner:(MMScanner *)scanner
  {
      // Collect everything that ends a value, then take the complement.
      NSMutableCharacterSet *terminators = [NSMutableCharacterSet whitespaceCharacterSet];
      [terminators addCharactersInString:@"\"'=><`"];
      NSCharacterSet *valueCharacters = [terminators invertedSet];

      NSUInteger consumed = [scanner skipCharactersFromSet:valueCharacters];
      return consumed > 0;
  }
  // Consumes zero or more attributes. Each attribute has to be preceded by
  // whitespace (newlines included) and is a name optionally followed by '='
  // and either a quoted string or an unquoted value.
  //
  // Parsing stops silently at the first thing that is not an attribute; the
  // method reports nothing, so callers inspect what follows in the scanner.
  //
  // scanner - The scanner positioned right after a tag name or a previous
  //           attribute.
  - (void)_parseAttributesWithScanner:(MMScanner *)scanner
  {
      while ([scanner skipWhitespaceAndNewlines] > 0)
      {
          NSRange nameRange = [self _parseNameWithScanner:scanner];
          if (nameRange.length == 0)
          {
              break;
          }

          // Peek past optional whitespace for an '='. Without one, the
          // transaction is committed with NO -- presumably handing the skipped
          // whitespace back so the loop condition can find it before the
          // next attribute (confirm against MMScanner).
          [scanner beginTransaction];
          [scanner skipWhitespace];
          BOOL hasValue = (scanner.nextCharacter == '=');
          [scanner commitTransaction:hasValue];

          if (!hasValue)
          {
              continue;
          }

          // Step over the '=' and whatever whitespace follows it.
          [scanner advance];
          [scanner skipWhitespace];

          // A quoted string is tried first; the unquoted form is only
          // attempted when that fails.
          BOOL didParseValue = [self _parseStringWithScanner:scanner]
                            || [self _parseAttributeValueWithScanner:scanner];
          if (!didParseValue)
          {
              break;
          }
      }
  }
  261. @end