代码之家  ›  专栏  ›  技术社区  ›  user3600107

在Scheme中优化CSV解析器

  •  0
  • user3600107  · 技术社区  · 1 年前

    我目前正在尝试学习Scheme(特别是chicken Scheme),并希望更好地了解该语言的性能陷阱。我已经写了一个CSV解析器,并在下面分享了它。我尝试使用的测试文件是130MB,解析大约需要7分钟。简单地读取所有行只需要几毫秒,所以问题在于解析器。我试图让我的代码尽可能地“Lispy”,并希望避免引入太多我知道在Chicken Scheme中可用的低级构造。我希望从改进这段代码中获得的主要东西是Scheme性能背后更好的直觉。

    编辑:我从评论中得到了一些建议,现在有两个独立的解析器。第一个代码块使用基于列表的解决方案,第二个使用字符串引用。我原以为由于线性记忆,字符串refs会更快,但实验只让我感到困惑。但这是我试图解决的核心问题。我真的不明白为什么一个解决方案比另一个更好。任何见解都会很棒。

    我还回去对性能进行了一些测量:

    列表解决方案已编译: 37.658s CPU time, 1.2s GC time (major), 258575354/244106 mutations (total/tracked), 1160/317431 GCs (major/minor), maximum live heap: 306.33 KiB

    已编译字符串引用解决方案: 644.939s CPU time, 1.308s GC time (major), 167854392/243430 mutations (total/tracked), 1281/1023116 GCs (major/minor), maximum live heap: 305.51 KiB

    ; List based solution
    (import (chicken string))
    (import utf8)
    (import list-utils)
    
    (define (lookahead-list ahead lst) 
      (cond [(null? ahead) #t]
        [(and (not (null? lst)) (not (null? ahead))) 
         (if (eq? (car ahead) (car lst)) (lookahead-list (cdr ahead) (cdr lst)) #f)]
        [else #f]
      )
    )
    
    (define (null-blist? blist) (null? (car blist)))
    
    (define (lookahead-blist ahead blst) (lookahead-list ahead (car blst)))
    
    (define (read-blist blist)
        (if (null? blist) #!eof
            (let ([head (caar blist)]) (set-car! blist (cdar blist)) head))
    )
    
    ; Csv parsing
    
    
    (define (csv/read-atom blist)
        (let loop ([res '()] [cmode #t]) 
          (cond [(lookahead-blist '(#\" #\") blist) (read-blist blist) (loop (cons (read-blist blist) res) cmode)]
            [(lookahead-blist '(#\") blist) (read-blist blist) (loop res (not cmode))]
            [(and cmode (lookahead-blist '(#\,) blist)) (reverse-list->string res)]
            [(null-blist? blist) (reverse-list->string res)]
            [else (loop (cons (read-blist blist) res) cmode)]
          ))
    )
    
    
    (define (csv/parse-non-blank-line blist)
        (reverse 
        (let loop ([res '()])
            (let ([nres (cons (csv/read-atom blist) res)])
                 (if (lookahead-blist '(#\,) blist) 
                   (begin (read-blist blist) (loop nres)) nres)
            )
        ))
    )
    
    (define (csv/parse-line str)
        (if (equal? str "") '() (csv/parse-non-blank-line (list (string->list str))))
    )
    
    (define (csv/parse func init port)
        (let loop ([acc init] [line (read-line port)])
            (if (eq? line #!eof) acc
                (loop (func acc (csv/parse-line line)) (read-line port))
            )
        )
    )
    
    (define (csv/parse-table func init port) 
        (let ([format (csv/parse-line (read-line port))]) 
            (define (shim acc elem) 
                (func acc (zip-alist format elem))
            )
            (csv/parse shim init port)
        )
    )
    
    
    (define (csv/parse-table-file func init fname) (call-with-input-file (lambda (p) (csv/parse-table func init p))))
    
    ; String-ref based solution
    (import (chicken io))
    (import (chicken string))
    (import utf8)
    (import list-utils)
    
    
    ; Cursor string
    (define (string->cstring str)
        (cons 0 str)
    )
    
    (define (cstring/forward cstr) 
        (cons (+ 1 (car cstr)) (cdr cstr))
    )
    
    (define (cstring/peek cstr)
        (if (cstring/null? cstr)
            #!eof
            (string-ref (cdr cstr) (car cstr))
        )
    )
    
    (define (cstring/forward! cstr)
        (let ([ret (cstring/peek cstr)]) 
            (set-car! cstr (+ 1 (car cstr)))
            ret
        )
    )
    
    (define (cstring/null? cstr)
        (>= (car cstr) (string-length (cdr cstr)))
    )
    
    (define (lookahead-cstring ahead cstr) 
      (cond [(null? ahead) #t]
        [(and (not (cstring/null? cstr)) (not (null? ahead))) 
         (if (eq? (car ahead) (cstring/peek cstr)) (lookahead-cstring (cdr ahead) (cstring/forward cstr)) #f)]
        [else #f]
      )
    )
    
    
    
    ; Csv parsing
    
    
    (define (csv/read-atom cstr)
        (let loop ([res '()] [cmode #t]) 
          (cond [(lookahead-cstring '(#\" #\") cstr) (cstring/forward! cstr) (loop (cons (cstring/forward! cstr) res) cmode)]
            [(lookahead-cstring '(#\") cstr) (cstring/forward! cstr) (loop res (not cmode))]
            [(and cmode (lookahead-cstring '(#\,) cstr)) (reverse-list->string res)]
            [(cstring/null? cstr) (reverse-list->string res)]
            [else (loop (cons (cstring/forward! cstr) res) cmode)]
          ))
    )
    
    
    
    (define (csv/parse-non-blank-line cstr)
        (reverse 
        (let loop ([res '()])
            (let ([nres (cons (csv/read-atom cstr) res)])
                 (if (lookahead-cstring '(#\,) cstr) 
                   (begin (cstring/forward! cstr) (loop nres)) nres)
            )
        ))
    )
    
    (define (csv/parse-line str)
        (if (equal? str "") '() (csv/parse-non-blank-line (string->cstring str)))
    )
    
    (define (csv/parse func init port)
        (let loop ([acc init] [line (read-line port)])
            (if (eq? line #!eof) acc
                (loop (func acc (csv/parse-line line)) (read-line port))
            )
        )
    )
    
    (define (csv/parse-table func init port) 
        (let ([format (csv/parse-line (read-line port))]) 
            (define (shim acc elem) 
                (func acc (zip-alist format elem))
            )
            (csv/parse shim init port)
        )
    )
    
    
    (define (csv/parse-table-file func init fname) (call-with-input-file fname (lambda (p) (csv/parse-table func init p))))
    
    
    0 回复  |  直到 1 年前
        1
  •  0
  •   wasamasa    1 年前

    CHICKEN Scheme可能会表现出病态的糟糕性能行为,因为代码对垃圾收集器的压力过大。您可以尝试调整GC参数(请参阅 -:? 可以传递给程序的运行时选项的完整列表的标志),或者重写程序以避免产生这么多垃圾。例如,评论中的建议都是用字符阅读 read-char / peek-char 整行阅读会有所帮助。

    就我个人而言,我确实认为使用特定于实现的特性没有错。你不必全力以赴。例如,您可以使用 cond-expand 介绍特定于实现的代码并定义 csv/read-line 使用的助手过程 read-line 从…起 (chicken io) 或以可移植Scheme编写的变体,使用 读取字符 peek字符 Scheme非常适合语言实验,所以不要害怕尝试各种策略,看看它们在可读性/可移植性/性能/。。。