markdown 用Swift 2.0编写的markdown-ish解析器
Markdown-ish parser. Regular expressions are not allowed in here!
The parser works in several stages:
1. split up the text into lines
2. combine lines into logical blocks
3. tokenize the contents of the blocks
The result of this process is a tree structure that describes the contents of
the file. This tree can then be rendered to HTML, for example.
import Foundation
// MARK: - String Methods
extension String {
  /**
   * Returns a copy of this string in which every occurrence of `s1` has been
   * replaced by `s2`.
   */
  func replace(s1: String, with s2: String) -> String {
    return self.stringByReplacingOccurrencesOfString(s1, withString: s2)
  }

  /**
   * HTML escapes a string.
   *
   * Replaces the five characters that are significant in HTML text and
   * attribute values (`&`, `"`, `'`, `<`, `>`) with character references.
   */
  func escape() -> String {
    var s = self
    // The ampersand must be handled first; otherwise the "&" introduced by
    // the other replacements would itself be escaped a second time.
    s = s.replace("&", with: "&amp;")
    s = s.replace("\"", with: "&quot;")
    s = s.replace("'", with: "&#39;")
    s = s.replace("<", with: "&lt;")
    s = s.replace(">", with: "&gt;")
    return s
  }
}
// MARK: - Types
/**
 * A parsed Markdown document: the original text plus the top-level blocks
 * that were found in it.
 */
struct Markdown {
  /**
   * The text from a Fragment is converted to a stream of Tokens. This allows
   * for the separation of actual text from the control characters that modify
   * the appearance of the text.
   */
  enum Token {
    case Spacing                          // any amount of whitespace
    case Text(text: String)               // words
    case Escape(symbol: String)           // --, <, >, ', & and so on
    case Open(symbol: String)             // ", *, **, ~~, ` spans
    case Close(symbol: String)
    case Link(fragment: Fragment, url: String)
    case Image(caption: String, url: String)
  }

  /**
   * Roughly speaking, each line in the input document corresponds to a fragment,
   * but the fragment will have any whitespace trimmed off. The last newline is
   * also stripped. If the line starts a new block, such as `- list item` then
   * the `-` is also stripped from the fragment.
   *
   * Note: Source code is not tokenized at the moment, but could be in the future
   * to support syntax highlighting. (Note: .Code blocks consist of only a single
   * fragment, so such fragments may include newlines.)
   */
  enum Fragment {
    case Text(tokens: [Token])
    case Code(code: String)
  }

  /**
   * The possible types of top-level blocks.
   */
  enum BlockType {
    case Empty                            // just for parsing
    case Header(level: Int)               // #
    case Text                             // regular paragraph of text
    case Quote                            // >
    case Code(language: String)           // ```language
    case CodeIndented                     // 4 spaces or tab
    case ListItem(ordered: Bool)          // 1. or -
  }

  /**
   * Describes a top-level block. The lines from the input document are grouped
   * into such blocks. Each block will have one or more Fragments.
   */
  struct Block {
    var type: BlockType
    var fragments: [Fragment] = []

    /// Creates a block of the given type with no fragments yet.
    init(type: BlockType) {
      self.type = type
    }
  }

  private let input: String               // the Markdown text
  private var blocks: [Block] = []        // the top-level blocks
}
// MARK: - Public API
extension Markdown {
  /**
   * Parses the given Markdown text. The text is split into lines, which are
   * then grouped into the top-level blocks stored in `blocks`.
   */
  init(string input: String) {
    self.input = input
    blocks = parseBlocks(parseLines())
  }

  /// Returns the top-level blocks that were parsed from the input.
  func tree() -> [Block] {
    return blocks
  }

  /// Removes the top-level block at `index`, which must be a valid index
  /// into the array returned by `tree()`.
  mutating func removeBlockAtIndex(index: Int) {
    blocks.removeAtIndex(index)
  }
}
// MARK: - Workarounds for Swift Issues
/**
 * For debugging only. Letting Swift do this automatically doesn't work
 * very dependably yet (Xcode 7 beta 5).
 */
extension Markdown.Token: CustomStringConvertible {
  /// A short, human-readable rendering of the token and its payload.
  var description: String {
    switch self {
    case .Spacing: return "Spacing"
    case .Text(let text): return text
    case .Escape(let symbol): return "Escape(\(symbol))"
    case .Open(let symbol): return "Open(\(symbol))"
    case .Close(let symbol): return "Close(\(symbol))"
    case .Link(let fragment, let url): return "Link(\(fragment),\(url))"
    case .Image(let caption, let url): return "Image(\(caption),\(url))"
    }
  }
}
extension Markdown.BlockType: CustomStringConvertible {
  /// A short tag of the form `@Name` or `@Name:payload`, for debugging output.
  var description: String {
    switch self {
    case .Empty: return "@Empty"
    case .Header(let level): return "@Header:\(level)"
    case .Text: return "@Text"
    case .Code(let language): return "@Code:\(language)"
    case .CodeIndented: return "@CodeIndented"
    case .Quote: return "@Quote"
    case .ListItem(let ordered): return "@ListItem:\(ordered)"
    }
  }
}
/**
 * These helper methods are necessary because `if case` cannot be combined with
 * other conditions.
 *
 * In Swift 2.0 you can't write: if case .Quote = foo && bar { ... }
 * or: if !case .Empty
 * or: if case .Empty || case .Header || case .Code
 *
 * But at least you can now write `if foo.isQuote() && bar { ... }`
 */
private extension Markdown.BlockType {
  /// True for `.Empty`.
  func isEmpty() -> Bool {
    if case .Empty = self { return true } else { return false }
  }

  /// True for `.Code`, whatever the language.
  func isCode() -> Bool {
    if case .Code = self { return true } else { return false }
  }

  /// True for `.Quote`.
  func isQuote() -> Bool {
    if case .Quote = self { return true } else { return false }
  }

  /// True for `.ListItem`, ordered or not.
  func isListItem() -> Bool {
    if case .ListItem = self { return true } else { return false }
  }

  /// True only for `.ListItem(ordered: true)`.
  func isOrderedList() -> Bool {
    // Leading dot added so this pattern matches the style of the others.
    if case .ListItem(let ordered) = self { return ordered } else { return false }
  }

  /// True for `.Empty`, `.Header` and `.Code`.
  func isEmpty_Header_Code() -> Bool {
    switch self {
    case .Empty, .Header, .Code: return true
    default: return false
    }
  }

  /// True for `.Text` and `.CodeIndented`.
  func isText_CodeIndented() -> Bool {
    switch self {
    case .Text, .CodeIndented: return true
    default: return false
    }
  }

  /// False for the two code block types, whose whitespace is significant;
  /// true for every other block type.
  func shouldTrimWhitespace() -> Bool {
    switch self {
    case .Code, .CodeIndented: return false
    default: return true
    }
  }
}
// MARK: - Supporting Methods
private extension Character {
  /// True for a space, a tab or a carriage return. A newline does not count.
  func isWhitespace() -> Bool {
    return self == " " || self == "\t" || self == "\r"
  }

  /// True for a space, a tab, a carriage return or a newline.
  func isWhitespaceOrNewline() -> Bool {
    return isWhitespace() || self == "\n"
  }
}
private extension Markdown {
  /**
   * Returns the index of the first character in `startIndex ..< endIndex` that
   * is neither whitespace nor a newline, or `endIndex` if there is none.
   */
  func eatLeadingWhitespace(startIndex: String.Index, _ endIndex: String.Index) -> String.Index {
    var i = startIndex
    while i < endIndex {
      if !input[i].isWhitespaceOrNewline() { return i }
      i = i.successor()
    }
    return endIndex
  }

  /**
   * Returns the index just past the last character in `startIndex ..< endIndex`
   * that is neither whitespace nor a newline, or `startIndex` if there is none.
   */
  func eatTrailingWhitespace(startIndex: String.Index, _ endIndex: String.Index) -> String.Index {
    // Only step backwards after checking that we are still past `startIndex`.
    // Calling predecessor() on the start of the string traps, which is what
    // used to happen for an empty range at the very beginning of the input.
    var i = endIndex
    while i > startIndex {
      let previous = i.predecessor()
      if !input[previous].isWhitespaceOrNewline() { return i }
      i = previous
    }
    return startIndex
  }
}
// MARK: - Splitting Into Lines
private extension Markdown {
  /**
   * Determine the indices in the input document at which new lines begin.
   *
   * Each recorded index points just past a newline, or at the end of the
   * input. The end of the input is always recorded twice.
   */
  func parseLines() -> [String.Index] {
    var lines: [String.Index] = []
    var i = input.startIndex
    while i < input.endIndex {
      let c = input[i]
      i = i.successor()

      // For convenience, the end of the string is recorded twice so the block
      // scanning logic doesn't need a separate check for end-of-text. When the
      // end is reached, it simply sees one final .Empty block.
      // It may not be immediately obvious, but the check for endIndex here
      // makes sure this happens whether the text ends with a newline or not.
      if c == "\n" || i == input.endIndex {
        lines.append(i)
      }
    }
    lines.append(i)  // add the endIndex again
    return lines
  }
}
// MARK: - Top-Level Blocks
private extension Markdown {
* Determine the top-level blocks in the file.
// Walks over the line-end indices in `lines`, identifies each line with
// `identify`, and uses the type of the upcoming line to decide what to do
// with the block that is currently being built. Returns the local `blocks`
// array.
//
// NOTE(review): in this listing several branches have empty bodies (for
// example `if !nextType.isEmpty() {` under `case .Empty:` and the whole of
// `case .Header:`), the nested helper functions end without visible closing
// braces, and nothing visibly appends to `blocks` or to `block.fragments`.
// Statements appear to be missing — confirm against the original file before
// relying on or changing this function.
func parseBlocks(lines: [String.Index]) -> [Block] {
var blocks: [Block] = []
var lineStart = input.startIndex
var fragmentStart = lineStart
var fragmentEnd = lineStart
var block = Block(type: .Empty)
// Presumably the number of empty lines seen inside an indented code block
// (see `case .CodeIndented` below); the increment is not visible here — TODO confirm.
var count = 0
for lineEnd in lines {
// This looks at the next line. If the type of this line is different from
// the current block, then we may need to end the block and make a new one.
// Exactly how depends on the particular block type. The "adjusted" start
// index is for skipping the symbol that identifies the line.
var (nextType, adjustedLineStart) = identify(lineStart, lineEnd)
// A fragment should not have leading or trailing whitespace or a newline.
if nextType.shouldTrimWhitespace() {
adjustedLineStart = eatLeadingWhitespace(adjustedLineStart, lineEnd)
let adjustedLineEnd = eatTrailingWhitespace(adjustedLineStart, lineEnd)
//print("LINE '" + input.substringWithRange(lineStart ..< lineEnd) + "' is type \(nextType)")
// Because the endIndex appears twice in the array, we can easily detect
// whether scanning has reached the end of the input. (By the way, the
// type of that last "line" is .Empty, so when checking for the Empty state
// we don't need to look at endOfInput also.)
let endOfInput = (adjustedLineStart == input.endIndex)
// It may be a bit weird to define inner functions here, but this allows
// them to use variables such as `nextType` and `block`, without passing
// those as parameters.
// Starts a fresh block of the upcoming line's type and resets the fragment
// start and the counter.
func beginNewBlock() {
block = Block(type: nextType)
fragmentStart = adjustedLineStart
count = 0
// Tokenizes the range `fragmentStart ..< fragmentEnd` into a text fragment.
func addTextFragment() {
let tokenized = tokenize(startIndex: fragmentStart, endIndex: fragmentEnd)
let fragment = Fragment.Text(tokens: tokenized)
// Copies the range `fragmentStart ..< fragmentEnd` verbatim into a code fragment.
func addCodeFragment() {
let s = input.substringWithRange(fragmentStart ..< fragmentEnd)
let fragment = Fragment.Code(code: s)
func finishBlock() {
// The current fragment always refers to the previous line(s). It is given
// by `fragmentStart` and `lineStart`, which is the end of the previous line
// and also the start of this one. So we don't immediately add new fragments,
// we always want to look at the next line first.
switch block.type {
case .Empty:
if !nextType.isEmpty() {
case .Header:
// A header is always just one line, so we can immediately add this block.
case .Text:
// A text block ends when the next line is empty, a header, or code.
if nextType.isEmpty_Header_Code() {
} else {
fragmentStart = lineStart
case .Quote:
// Each line in a quote is added as a new fragment, and we strip off
// the leading > character.
// A quote ends when the next line is empty, a header, or code.
if nextType.isEmpty_Header_Code() {
} else {
// Any other type of line also gets added to the quote. If it
// starts with >, we strip that off.
if case .Quote = nextType {
fragmentStart = adjustedLineStart
} else {
fragmentStart = lineStart
case .Code:
// A code block ends after a closing line of ``` backticks.
if nextType.isCode() || endOfInput {
block = Block(type: .Empty)
case .CodeIndented:
// Each line in an indented code block is added as a new fragment,
// allowing us to strip off the leading spaces/tabs.
if count == 0 {
// If the next line is empty, keep going. If the empty line(s) is/are
// followed by more code, then we'll insert empty fragments.
if nextType.isEmpty() && !endOfInput {
fragmentStart = lineEnd.predecessor()
} else if case .CodeIndented = nextType {
// If the next line is also an indented code block, then keep going.
// If we've seen empty lines, then add an empty fragment for each line.
if count > 0 {
// Collapsing the range to zero length makes each added fragment empty.
fragmentEnd = fragmentStart
for _ in 1...count { addCodeFragment() }
count = 0
fragmentStart = adjustedLineStart
} else {
// If the next line is any other kind of block, then the indented
// code block has ended.
case .ListItem:
// Each line in a list item is added as a new fragment, and we strip
// off the leading - character.
// If the next line is text or indented code, then interpret this as
// another fragment that also belongs to this list item.
if nextType.isText_CodeIndented() {
fragmentStart = adjustedLineStart
} else {
// Advance to the next line; the end of this line's trimmed text becomes
// the end of the pending fragment.
lineStart = lineEnd
fragmentEnd = adjustedLineEnd
return blocks
// MARK: - Line Identification
private extension Markdown {
typealias LineType = (BlockType, String.Index)
* Scans the beginning of the line in order to identify what sort of line
* this is.
* Returns a new String.Index that points at the beginning of the actual text,
* having skipped the identifying characters (#, 1., -) but not necessarily
* any whitespace.
// Classifies the line in `startIndex ..< endIndex` and returns its block type
// together with an index into the line. The nested functions each handle one
// kind of leading marker and fall back to `.Text` when the marker turns out
// not to introduce that construct.
//
// NOTE(review): this listing has no closing braces, and some branches are
// visibly empty (e.g. `if input[i] == "#" {` in `header()` and `case "\n":`
// in `whitespace()`). Statements appear to be missing, including the `default:`
// labels of the `switch` statements — confirm against the original file.
func identify(startIndex: String.Index, _ endIndex: String.Index) -> LineType {
var i = startIndex
// A run of # characters followed by some non-whitespace text is a header.
func header() -> LineType {
let textStart = i
// Presumably the header level; the increment is not visible here — TODO confirm.
var count = 1
i = i.successor()
for ; i < endIndex; i = i.successor() {
if input[i] == "#" {
} else {
let headerStart = i
i = i.successor()
for ; i < endIndex; i = i.successor() {
if !input[i].isWhitespaceOrNewline() {
return (.Header(level: count), headerStart)
return (.Text, textStart) // a # by itself
// Three backticks start (or end) a fenced code block. Whatever follows the
// backticks, up to the last character of the line, is used as the language.
func code() -> LineType {
let textStart = i
i = i.successor()
if i < endIndex && input[i] == "`" {
i = i.successor()
if i < endIndex && input[i] == "`" {
i = i.successor()
let j = endIndex.predecessor() // not true for very last line if no newline
if i < j {
return (.Code(language: input.substringWithRange(i ..< j)), endIndex)
} else {
return (.Code(language: ""), endIndex)
return (.Text, textStart)
// A "-" followed by whitespace is an unordered list item.
func unorderedListItem() -> LineType {
let textStart = i
i = i.successor()
if i < endIndex && input[i].isWhitespace() {
return (.ListItem(ordered: false), i)
return (.Text, textStart)
// A single digit followed by "." and whitespace is an ordered list item.
func orderedListItem() -> LineType {
let textStart = i
i = i.successor()
if i < endIndex && input[i] == "." {
i = i.successor()
if i < endIndex && input[i].isWhitespace() {
return (.ListItem(ordered: true), i.successor())
return (.Text, textStart)
// Dispatches on the first non-whitespace character of the line.
func identifier() -> LineType {
switch input[i] {
case "#":
return header()
case ">":
return (.Quote, i.successor())
case "`":
return code()
case "-":
return unorderedListItem()
case "1", "2", "3", "4", "5", "6", "7", "8", "9":
return orderedListItem()
return (.Text, i)
// Measures the leading indentation: a space or carriage return counts as 1
// and a tab as 4. A total of 4 or more makes the line indented code.
func whitespace() -> LineType {
var count = 0
var codeStart = i
for ; i < endIndex; i = i.successor() {
switch input[i] {
case "\n":
case " ", "\r":
count += 1
case "\t":
count += 4
if count >= 4 {
return (.CodeIndented, codeStart)
} else {
return identifier()
// Once exactly 4 columns have been consumed, the code starts right after
// the current character.
if count == 4 {
codeStart = i.successor()
return (.Empty, endIndex)
if i == endIndex || input[i] == "\n" {
return (.Empty, endIndex)
} else {
switch input[i] {
case " ", "\t", "\r":
return whitespace()
return identifier()
// MARK: - Tokenization of Fragments
private extension Markdown {
* The tokenizer takes in a text fragment and outputs a stream of `Token` objects.
* For example, the input:
* aaa **x**
* I've
* becomes the following stream of tokens: Text(aaa) Spacing Open(bold) Text(x)
* Close(Bold) Newline Text(I) Escape(') Text(ve)
* Multiple spaces get combined into a single token.
// Converts the characters in `startIndex ..< endIndex` of `input` into an
// array of tokens. The nested functions share the lookahead index `i` and the
// `tokens` array; the `while` loop at the bottom dispatches on the character
// at `i`.
//
// NOTE(review): this listing has no closing braces, and many branches are
// visibly empty — for instance every `case` of the final `switch` except
// `"\n"`, and both arms of `escapeDash()`. The calls that connect the
// dispatch loop to the helpers appear to be missing — confirm against the
// original file before relying on or changing this function.
func tokenize(startIndex startIndex: String.Index, endIndex: String.Index) -> [Token] {
var tokens = [Token]()
var i = startIndex // the lookahead character
func addTextToken(s: String) {
tokens.append(Token.Text(text: s))
func addEscapeToken(s: String) {
tokens.append(Token.Escape(symbol: s))
// Skips over a run of whitespace characters.
func spacing() {
for ; i < endIndex; i = i.successor() {
if !input[i].isWhitespace() { break }
// Looks for a second "-" following one that was already consumed.
func escapeDash() {
if i < endIndex && input[i] == "-" {
i = i.successor()
} else {
// Looks for two more "." characters following one that was already consumed.
func escapeEllipsis() {
if i < endIndex && input[i] == "." {
i = i.successor()
if i < endIndex && input[i] == "." {
i = i.successor()
} else {
} else {
// Consumes one character and branches on whether it is "-", "." or something else.
func escape() {
let c = input[i]
i = i.successor()
if c == "-" {
} else if c == "." {
} else {
// Tracks, per symbol, whether a span opened with that symbol is still open.
var seenOpen = [String: Bool]()
func addOpenOrCloseToken(symbol: String, strict: Bool = false) {
// The official rules are that *word* and *word1 word2* will work but
// not * word *, *word *, or * word*. To keep things simple, we only
// require that the opening * is followed by non-whitespace; where the
// closing * is doesn't matter.
if let open = seenOpen[symbol] where open {
seenOpen[symbol] = false
tokens.append(Token.Close(symbol: symbol))
} else if strict && (i == endIndex || input[i].isWhitespace()) {
} else {
seenOpen[symbol] = true
tokens.append(Token.Open(symbol: symbol))
// Looks for a second "~" following one that was already consumed.
func strikethrough() {
if i < endIndex && input[i] == "~" {
i = i.successor()
} else {
// Note that italics, bold, etc, do not work across multiple lines.
// That happens because we tokenize each fragment individually and we reset
// the tokenization state with each new fragment.
func italicsOrBold() {
if i < endIndex && input[i] == "*" {
i = i.successor()
addOpenOrCloseToken("**", strict: true)
} else {
addOpenOrCloseToken("*", strict: true)
// Consumes one character and branches on whether it is "*", "~" or something else.
func span() {
let c = input[i]
i = i.successor()
if c == "*" {
} else if c == "~" {
} else {
// Handles a backslash: if the following character is *, [ or ], that
// character is consumed as well.
func backslash() {
i = i.successor()
if i < endIndex {
let c = input[i]
if c == "*" || c == "[" || c == "]" {
i = i.successor()
// Tries to read `[text](url)` starting at the `[` under `i`. On success
// returns the whitespace-trimmed index ranges of the text and of the URL;
// otherwise resets `i` to just past the `[` and returns nil.
func parseLink() -> ((String.Index, String.Index), (String.Index, String.Index))? {
let textStart = i.successor()
// Loop until we find ] followed by ( followed by ). It's not a real link
// unless it has text and a valid URL, but we're not that picky.
// To be honest, this is where a regexp is the simpler solution. ;-)
for ; i < endIndex; i = i.successor() {
if input[i] == "]" {
let textEnd = i
i = i.successor()
if i < endIndex && input[i] == "(" {
i = i.successor()
let urlStart = i
for ; i < endIndex; i = i.successor() {
if input[i] == ")" {
let urlEnd = i
i = i.successor()
return ((eatLeadingWhitespace(textStart, textEnd),
eatTrailingWhitespace(textStart, textEnd)),
(eatLeadingWhitespace(urlStart, urlEnd),
eatTrailingWhitespace(urlStart, urlEnd)))
// This does not appear to be a validly formatted link.
i = textStart
return nil
// Builds a Link token; the link text is itself tokenized recursively.
func link() {
if let ((textStart, textEnd), (urlStart, urlEnd)) = parseLink() {
let tokenized = tokenize(startIndex: textStart, endIndex: textEnd)
let fragment = Fragment.Text(tokens: tokenized)
let url = input.substringWithRange(urlStart ..< urlEnd)
let token = Token.Link(fragment: fragment, url: url)
// Builds an Image token from `![caption](url)`; the caption is kept as plain text.
func image() {
i = i.successor()
if i < endIndex && input[i] == "[" {
if let ((captionStart, captionEnd), (urlStart, urlEnd)) = parseLink() {
let caption = input.substringWithRange(captionStart ..< captionEnd)
let url = input.substringWithRange(urlStart ..< urlEnd)
let token = Token.Image(caption: caption, url: url)
} else {
// True for any character that terminates a word.
func endsWord(c: Character) -> Bool {
return c == " " || c == "\t" || c == "\n" || c == "\r" || // whitespace
c == "'" || c == "<" || c == ">" || c == "&" || c == "-" || c == "." || // escapes
c == "`" || c == "\"" || c == "*" || c == "~" || // span
c == "\\" || c == "[" || c == "!"
// Collects characters up to the next word-ending character into a Text token.
func word() {
let wordStart = i
for ; i < endIndex; i = i.successor() {
if endsWord(input[i]) { break }
addTextToken(input.substringWithRange(wordStart ..< i))
while i < endIndex {
switch input[i] {
case " ", "\t", "\r":
case "'", "<", ">", "&", "-", ".":
case "`", "\"", "*", "~":
case "\\":
case "[":
case "!":
case "\n":
fatalError("fragments should not contain newlines")
return tokens
// MARK: - HTML Rendering
/// HTML output for each `Token.Escape` symbol. The three characters that are
/// significant in HTML must be written as character references; the
/// typographic replacements are emitted as literal characters.
private let escapeTable = [
  "'": "’",
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "-": "–",
  "--": "—",
  "...": "…",
]

/// HTML output for each `Token.Open` symbol.
private let openTable = [
  "`": "<code>",
  "\"": "“",
  "*": "<em>",
  "**": "<strong>",
  "~~": "<del>",
]

/// HTML output for each `Token.Close` symbol. Has the same keys as `openTable`.
private let closeTable = [
  "`": "</code>",
  "\"": "”",
  "*": "</em>",
  "**": "</strong>",
  "~~": "</del>",
]
private extension Markdown.Token {
  /**
   * Renders this token as HTML.
   *
   * The table lookups are force-unwrapped: a symbol that is missing from
   * `escapeTable`, `openTable` or `closeTable` is treated as a programming
   * error and traps.
   */
  func toHTML() -> String {
    switch self {
    case .Spacing: return " "
    case .Text(let text): return text
    case .Escape(let symbol): return escapeTable[symbol]!
    case .Open(let symbol): return openTable[symbol]!
    case .Close(let symbol): return closeTable[symbol]!
    case .Link(let fragment, let url):
      return "<a href=\"\(url.escape())\">" + fragment.toHTML() + "</a>"
    case .Image(let caption, let url):
      return "<img src=\"\(url.escape())\" alt=\"\(caption.escape())\">"
    }
  }
}
private extension Markdown.Fragment {
  /// Renders this fragment as HTML: text fragments concatenate the HTML of
  /// their tokens, code fragments are HTML-escaped verbatim.
  func toHTML() -> String {
    switch self {
    case .Text(let tokens):
      var s = ""
      for token in tokens {
        s += token.toHTML()
      }
      return s
    case .Code(let string):
      return string.escape()
    }
  }

  // If the text block contains only one image token, then we turn it into
  // a <figure> instead of a <p> paragraph.
  func isImageOnly() -> Bool {
    if case .Text(let tokens) = self where tokens.count == 1, case .Image = tokens[0] {
      return true
    } else {
      return false
    }
  }
}
private extension Markdown.Block {
  /// True if this block consists of exactly one fragment that holds nothing
  /// but a single image token.
  func isImageOnly() -> Bool {
    return fragments.count == 1 && fragments.first!.isImageOnly()
  }

  /**
   * Renders all fragments of this block, joined by `separator`.
   * The block must have at least one fragment.
   */
  func formatFragments(separator: String = "<br>\n") -> String {
    precondition(fragments.count > 0)
    if fragments.count == 1 {
      return fragments.first!.toHTML()
    } else {
      return separator.join(fragments.map { $0.toHTML() })
    }
  }

  /**
   * Renders this block as HTML. Quotes and list items only produce their own
   * element; the surrounding <blockquote>, <ol> or <ul> is the caller's job.
   * An `.Empty` block is never expected here and traps.
   */
  func toHTML() -> String {
    switch type {
    case .Empty:
      fatalError("should not happen")
    case .Header(let level):
      return "<h\(level)>" + formatFragments() + "</h\(level)>\n\n"
    case .Text:
      if isImageOnly() {
        return "<figure class=\"image\">" + formatFragments() + "</figure>\n\n"
      } else {
        return "<p>" + formatFragments("\n") + "</p>\n\n"
      }
    case .Quote:
      return "<p>" + formatFragments() + "</p>\n"
    case .Code(let language):
      var s = "<figure class=\"code\"><pre><code"
      if !language.isEmpty {
        s += " class=\"\(language.escape())\""
      }
      s += ">" + formatFragments() + "</code></pre></figure>\n\n"
      return s
    case .CodeIndented:
      return "<figure class=\"code\"><pre><code>" + formatFragments("\n") + "</code></pre></figure>\n\n"
    case .ListItem:
      return "<li>" + formatFragments() + "</li>\n"
    }
  }
}
extension Markdown {
  /**
   * Renders the parsed document as an HTML string.
   *
   * Consecutive quote blocks are wrapped in a single <blockquote>, and
   * consecutive list items in a single <ol> or <ul>, chosen by the first
   * item of the run.
   */
  func renderHTML() -> String {
    var s = ""
    var lastType = BlockType.Empty
    var wasOrderedList = false

    for block in blocks {
      // Close whatever container the previous block was in before opening a
      // new one, so that a list followed directly by a quote nests correctly.
      if lastType.isQuote() && !block.type.isQuote() {
        s += "</blockquote>\n\n"
      }
      if lastType.isListItem() && !block.type.isListItem() {
        s += wasOrderedList ? "</ol>\n\n" : "</ul>\n\n"
      }

      if !lastType.isQuote() && block.type.isQuote() {
        s += "<blockquote>\n"
      }
      if !lastType.isListItem() && block.type.isListItem() {
        wasOrderedList = block.type.isOrderedList()
        s += wasOrderedList ? "<ol>\n" : "<ul>\n"
      }

      s += block.toHTML()
      lastType = block.type
    }

    // A document that ends inside a quote or a list still needs its
    // container closed.
    if lastType.isQuote() {
      s += "</blockquote>\n\n"
    }
    if lastType.isListItem() {
      s += wasOrderedList ? "</ol>\n\n" : "</ul>\n\n"
    }
    return s
  }
}
# Barfdown: a Markdown-ish Parser Written in Swift
Goals for this project:
- Parse a simplified version of Markdown that is good enough for writing my blog posts.
- Be reasonably efficient. This means the parser shouldn't copy substrings around if not necessary. This is done by storing all the elements as indexes into the original text.
- Be small and therefore be easy to understand.
- No regular expressions. They are the lazy person's solution to parsing. ;-)
This is just a toy project for me to experiment with writing parsers in Swift. Because why not?
There may be bugs.
## Supported Markdown
The Markdown tags that are currently supported:
# Headers
1. numbered
2. list
- unordered
- list
> quote
[Link text](http://url)
![Image caption](http://url)
source code (indented 1 tab or 4 spaces)
\* literal asterisk
\[ literal [
\] literal ]
Not supported are:
- Tables
- Horizontal rules
- Nested lists
- Unordered lists starting with `*`
- Headers that are underlined
- Line breaks (line ending in two spaces)
- Probably lots of other stuff...
This could really do with a good test suite. ;-)
## How to use it
Create a new `Markdown` instance and give it a `String`. The parser creates a tree structure that describes the Markdown document. You can either step through that tree yourself or simply call `renderHTML()` to convert it to HTML.
if let data = NSData(contentsOfFile: path) {
  if let text = NSString(data: data, encoding: NSUTF8StringEncoding) {
    let m = Markdown(string: text as String)
    let s = m.renderHTML()
    print("<!DOCTYPE html><html><head><meta charset=\"utf-8\"/></head><body>\n")
    print(s, appendNewline: false)
    // Close the elements opened by the header line above.
    print("</body></html>")
  }
}
## Why not regexps?
They definitely make parsing easier but I don't like throwing a handful of regexps at a parsing problem.
When you use a regular expression, it is turned into a state machine by the regex parser. Here I've basically "unrolled" all those state machines by hand.
变量已编写但从未在 Swift 2.0 和 Xcode 7 中使用