; Copyright (c) 2004, 2010 James Bailie <jimmy@mammothcheese.ca>.
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
;     * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;     * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;     * The name of James Bailie may not be used to endorse or promote
; products derived from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS"
; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
; POSSIBILITY OF SUCH DAMAGE.

; This filter emulates the "fmt" utility.  It is slower than the system fmt
; because this program uses regular expressions.

; The program recognizes three optional arguments:

; -l n specifies the desired line length of output lines.  Defaults to 75
;      characters.

; -t n specifies where tabstops occur.  Defaults to every 8 columns.

; -p   by itself indicates the script should only format the text appearing
;      after a prefix string consisting of non-alphanumeric characters,
;      optionally delimited by whitespace.  The prefix string is extracted
;      from the first input line and printed at the beginning of all output
;      lines.  This option can also be used to cause the leading whitespace
;      of the first input line to be propagated to all the output lines,
;      which means with this option, you can prevent leading whitespace
;      from being consumed during the reflow operation.
;      Prefixes are not recognized by default.

#include "options.mm"

(declare len 75)            ; Default desired line length of output lines.
(declare tabstop 8)         ; Default tabstop periodicity.
(declare prefix "")         ; Variable to hold quote or comment
                            ; prefix if we are recognizing one.

(declare current_line "")   ; Working variable holding text accumulated
                            ; from the current paragraph.

; Regexp used to detect quote and comment prefixes, and indentation.

(declare prefix_rx (regcomp "^([\b\t]*[^A-Za-z0-9\b\t\"'`&<(]*[\b\t]*)([^\b\t])?"))

; Regexps used to detect leading and trailing whitespace, and strings composed
; entirely of whitespace.

(declare trail_rx (regcomp "\b*$"))
(declare lead_rx (regcomp "^\b+(.*)"))
(declare space_rx (regcomp "^\b*$"))

; Regexp used to find the breakpoint when wrapping long lines.

(declare wrap_rx (regcomp "^(.*[^\b])?\b+([^\b]+)?$"))

; Regexps used to detect sentence endpoints.  We insert two spaces between
; sentences we join together.

(declare start_rx (regcomp "^\b*[\"`]?[A-Z]"))
(declare end_rx (regcomp "[.?!]['\"]?\b*$"))

; Regexp used to detect lines ending with a colon.  We insert two spaces
; between a line ending with a colon and the subsequent line, if we are
; joining the two lines together.

(declare colon_rx (regcomp ":\b*$"))

; Check for options.  Don't allow a value <= 0.

(getopt (push (stack) "p"))
(declare arg 0)

(when (bind arg (lookup options "l"))
   (bind len (or (abs (digitize arg)) 75)))

(when (bind arg (lookup options "p"))
   (bind prefix arg))

(when (bind arg (lookup options "t"))
   (bind tabstop (or (abs (digitize arg)) 8)))

; These functions are called to continue to process unprocessed input for
; the current paragraph, after we have stopped receiving input for the
; current paragraph.

(declare find_white
   (lambda (m len before after)
      (if (and (not m) len)

         (let ((before (concat before (substring after 0 1)))
               (len (- (length after) 1)))

            (find_white (matches wrap_rx before)
                        len
                        before
                        (if len (substring after 1 0) "")))

         (let ((s (stack)))
            (push s m)
            (push s after)
            s))))

(declare process_line
   (lambda (line)

      ; First section appends new input line to stored text, with appropriate
      ; separator.  Trailing whitespace is removed from input lines.

      (bind current_line
         (concat  current_line

                  (if (or (eq line "") (eq current_line ""))
                     ""
                     " ")

                  (substitute trail_rx "" line)))

      ; Second section is executed only when we have enough stored text to split
      ; off a new line of the specified width.

      (when (> (length current_line) len)

         ; The stored text is split at the desired width.

         (letn ((before (substring current_line 0 len))
                (after (substring current_line len 0))
                (m (matches lead_rx after)))

            (if m

               ; This clause is executed if the stored text splits naturally
               ; before a chunk of whitespace.  If the part of the stored text
               ; before the split is not whitespace, we print it as a new line,
               ; trimming any trailing whitespace, and update the stored text.

               (progn
                  (unless (match space_rx before)
                     (print prefix (substitute trail_rx "" before) (char 10)))
                  (bind current_line (index m 1)))

               ; Otherwise if wrap_rx does not match the segment of the stored
               ; text before the split location, that means there is no
               ; whitespace in that segment, so we move the location of the split
               ; forward until we either run out of characters, or we find some
               ; whitespace to break the line at.

               (letn ((before (concat before (substring after 0 1)))
                      (len (- (length after) 1))
                      (r (find_white (matches wrap_rx before)
                                     len
                                     before
                                     (if len (substring after 1 0) "")))
                      (m (index r 0))
                      (after (index r 1)))

                  (if (not m)

                     ; This clause executes if there is no whitespace left in the
                     ; stored input text.  We just spit it all out as a one big
                     ; line, and clear out the stored text.

                     (progn
                        (print prefix current_line (char 10))
                        (bind current_line ""))

                     ; Otherwise, we print the segment of the stored text before the
                     ; (possibly new) split location and update the stored text
                     ; text.

                     (when (index m 1) (print prefix (index m 1) (char 10)))
                     (bind current_line (concat (index m 2) after)))))))))

(declare print_rest_helper
   (lambda (result)
      (when result
         (print_rest_helper (process_line "")))))

(declare print_rest
   (lambda (last)

      ; Feed process_line dummy lines, to get it to work through the remaining
      ; stored text.

      (print_rest_helper 1)

      ; process_line will return 0 when there is insufficient stored text left to
      ; break into two lines.  This means a last short line may still be present
      ; in current_line.  If so, we print it out.

      (when (not (eq current_line ""))
         (print prefix current_line (char 10)))

      ; If this function is called with a "last" argument of 0, then the script
      ; has encountered a blank line between paragraphs, so we need to print a
      ; blank line and clear out the last line of input data.  Otherwise, the
      ; script has reached the end of input data, and we need do nothing more.

      (unless last
         (print prefix (char 10))
         (bind current_line ""))))

; Wrapper function for process_line, which detects paragraph-separating blank
; lines, and removes quote and comment prefixes if we are recognizing them.

(declare wrapper
   (lambda (line)

      (when line

         ; Remove terminators and expand tabs.

         (let ((line (expand_tabs tabstop (chomp line))))

            ; If we're recognizing prefixes and this is the first line read, record the
            ; prefix and alter the line length as necessary.

            (when (eq prefix 1)
               (bind prefix (index (matches prefix_rx line) 1))

               ; If the prefix found is longer than the specified line length, ignore
               ; the request to recognize prefixes.

               (let ((old len))
                  (when (<= (bind len (- len (length prefix))) 0)
                     (bind prefix "")
                     (bind len old))))

            ; If we're recognizing prefixes, remove the prefix from the line.

            (let ((line (if prefix (substitute prefix_rx "\2" line) line)))

               ; Do we have a paragraph separating blank line, or text?

               (if (match space_rx line)
                  (print_rest 0)
                  (process_line line))))

         (wrapper (getline)))))

(declare foreach_source
   (lambda (file)
      (when file
         (with_input_file file
            (wrapper (getline)))
         (foreach_source (next)))))

(if (next)
   (foreach_source (current))
   (wrapper (getline)))

(print_rest 1)
