#! /usr/bin/gosh
;;;
;;; scbayes - simple front-end of scmail/bayesian-filter
;;;
;;;  Copyright(C) 2003 by Shiro Kawai (shiro@acm.org)
;;;
;;;  Permission to use, copy, modify, distribute this software and
;;;  accompanying documentation for any purpose is hereby granted,
;;;  provided that existing copyright notices are retained in all
;;;  copies and that this notice is included verbatim in all
;;;  distributions.
;;;  This software is provided as is, without express or implied
;;;  warranty.  In no circumstances the author(s) shall be liable
;;;  for any damages arising out of the use of this software.
;;;

(use gauche.parseopt)
(use gauche.parameter)
(use gauche.version)
(use dbm)
(use dbm.gdbm) ;; should be customizable
(use util.digest)
(use srfi-13)
(use srfi-19)
(use rfc.md5)
(use file.util)
(use scmail.bayesian-filter)
(use scmail)
(use scmail.mail)
(use scmail.mailbox)
(use scmail.config)
(use scmail.progress)
(use scmail.util)

;; NB: this script is just for an experiment of bayesian filtering, and
;; not intended to be a robust tool for day-to-day use, although it's quite
;; possible that this will eventually evolve to such a tool.

;; Usage:
;;   scbayes --learn-nonspam folder
;;   scbayes --learn-spam folder
;;   scbayes --check-mail path
;;   scbayes --check-folder folder

(define command (lambda () (usage)))

(define *table-file* #f)
(define (table-file)
  (or *table-file* (scmail-config-get-path 'token-table)))

(define (check-folder-path mailbox folder)
  (unless (or (and (absolute-path? folder) (file-is-directory? folder))
              (file-is-directory? 
               (scmail-mailbox-folder->path mailbox folder)))
          (scmail-eformat "no such folder: ~a" folder)))

(define (usage)
  (print #`"Usage: ,(sys-basename *program-name*) [options] command")
  (print "  Mandatory commands (exclusive):")
  (print "   To build a table: ")
  (print "   * A directory in absolute path can be specified instead of a folder.")
  (print "    --learn-nonspam folder...   Learn mails in the folder(s) as non-spams")
  (print "    --learn-spam folder...      Learn mails in the folder(s) as spams")
  (print "    --unlearn-nonspam folder... Unlearn mails in the folder(s) as non-spams")
  (print "    --unlearn-spam folder...    Unlearn mails in the folder(s) as spams")
  (print "   To test:")                   
  (print "    --check-mail path           Show spamness of the mail")
  (print "    --check-spam folder         Check spamness of mails in the folder")
  (print "    --check-nonspam folder      Check spamness of mails in the folder")
  (print "    --table-stat                Prints # of entries in the table")
  (print "    --dump-table                Dump entries in the table")
  (print "    --dump-digest               Dump entries in the digest DB")
  (print "  Options")                     
  (print "    --force                     Don't skip mails already digest")
  (print "    --slow                      Learn slowly with periodic sleep")
  (print "    --flush-interval num        Specify interval of flushing [unlimited]")
  (print "    --table file                Specify alternative DB file")
  (print "    --digest file               Specify alternative digest DB file")
  (print "    -c, --config file               Specify alternative config file")
  (print "    -d, --scmail-dir dir            Specify scmail's directory")
  (print "    -v, --verbose                   Work noisily (diagnostic output)")
  (print "    -q, --quiet                     Suppress all normal output")
  (print "    -h, --help                      Display this help and exit")
  (exit 0))

(define (temporary-table-file)
  (string-append (table-file) ".tmp"))

(define (lock-file)
  (string-append (table-file) ",lock"))

(define (prepare-temporary-files)
  (define (copy progress src dest)
    (call-with-output-file dest
      (lambda (oport)
        (call-with-input-file src
          (lambda (iport)
            (let loop ((block (read-block 8192 iport))
                       (i 1))
              (unless (eof-object? block)
                      (begin
                        (progress-inc! progress (string-length block))
                        (display block oport)
                        (if (and (slow?) (= (modulo i 128) 0)) (sys-sleep 1))
                        (loop (read-block 8192 iport) (+ i 1))))))))))


  (when (and (not (file-exists? (table-file)))
             (file-exists? (digest-file)))
        (scmail-wformat "~a is found while ~a is not found." 
                        (digest-file) (table-file))
        (scmail-eformat "(Please remove ~a if you don't need it.)"
                        (digest-file)))
      
  (when (file-exists? (table-file))
        (let1 progress (make <progress> 
                           :title "prepare"
                           :total (+ (file-size (table-file))
                                     (if (file-exists? (digest-file))
                                         (file-size (digest-file))
                                         0)))
              (copy progress (table-file) (temporary-table-file))
              (if (file-exists? (digest-file))
                  (copy progress (digest-file) (temporary-digest-file)))
              (progress-finish! progress))))

(define (swap-files)
  (with-signal-handlers
   (((list SIGINT SIGHUP SIGTERM) => #f))
   (lambda ()
     (sys-rename (temporary-table-file) (table-file))
     (sys-rename (temporary-digest-file) (digest-file)))))

(if (version<? (gauche-version) "0.9")
    (define (lock)
      (unless (eq? (create-directory* (lock-file)) #t)
        (scmail-wformat "~a is now being updated" (table-file))
        (scmail-wformat "or perhaps ~a is staled." (lock-file))
        (scmail-eformat "(Please remove the lock file if it is staled.)")))
    (define (lock)
      (guard (e (else
                 (scmail-wformat "~a is now being updated" (table-file))
                 (scmail-wformat "or perhaps ~a is staled." (lock-file))
                 (scmail-eformat "(Please remove the lock file if it is staled.)")))
        (create-directory* (lock-file)))))

(define force-learn? (make-parameter #f))

(define slow? (make-parameter #f))

(define *digest-file* #f)

(define (digest-file)
  (or *digest-file* (scmail-config-get-path 'digest)))

(define digest-db (make-parameter #f))

(define digest-cache (make-parameter (make-hash-table 'string=?)))

(define (temporary-digest-file)
  (string-append (digest-file) ".tmp"))

(define (add-to-digest-db! key value)
  (dbm-put! (digest-db) key value))

(define (delete-from-digest-db! key value)
  (dbm-delete! (digest-db) key))

(define (learned? digest)
  (if (or (hash-table-exists? (digest-cache) digest)
          (and (digest-db) (dbm-exists? (digest-db) digest)))
      #t
      #f))

(define (not-learned? digest)
  (not (learned? digest)))

(define (mail-digest mail)
  (let1 md5 (make <md5>)
        (for-each (lambda (name) (digest-update! md5 
                                                 (scmail-mail-query mail name)))
                  '(date from message-id subject))
        (digest-hexify (digest-final! md5))))

(define (add-to-digest-cache! digest)
  (hash-table-put! (digest-cache) digest (number->string (sys-time))))

(define-constant *dbm-class* <gdbm>) ;; should be customizable
(define (open-digest-db file)
  (with-error-handler 
   (lambda (e) 
     (scmail-eformat "~a" (ref e 'message)))
   (lambda ()
     (digest-db 
      (dbm-open *dbm-class* :path file :rw-mode :write)))))

(define (collect-target-files mailbox folder)
  (if (and (absolute-path? folder)
           (file-is-directory? folder))
      (directory-list folder
                      :children? #t
                      :add-path? #t
                      :filter (lambda (x)
                                (file-is-regular?
                                 (build-path folder
                                             x))))
      (scmail-mailbox-mail-list mailbox folder)))

(define (collect-target-files-from-folders mailbox folders)
  (apply append
         (map (lambda (folder)
                (collect-target-files mailbox folder))
                folders)))

(define flush-interval (make-parameter 0)) ;; unlimited

(define (learn-common table-type folders 
                      task-name skip? process-words update-digest-db!)

  (define (flush-token-table-cache progress)
    (token-table-cache-flush 
     (lambda (i) 
       (progress-inc! progress)
       (if (and (slow?) (= (modulo i 100) 0)) (sys-sleep 1))
       )))

  (define (flush-digest-cache progress)
    (let1 counter 1
          (hash-table-for-each 
           (digest-cache)
           (lambda (key value)
             (update-digest-db! key value)
             (progress-inc! progress)
             (if (and (slow?) (= (modulo counter 100) 0)) (sys-sleep 1))
             (inc! counter)))
          (digest-cache (make-hash-table 'string=?))))

  (define (flush-both-cache)
    (let1 total (+ (length (hash-table-keys (digest-cache)))
                   (token-table-cache-length))
          (if (> total 0)
              (let1 progress (make <progress> 
                               :title "flush"
                               :total total
                               :bar-mark #\.)
                    (flush-token-table-cache progress)
                    (flush-digest-cache progress)
                    (progress-finish! progress)))))

  (define (learn-files files)
    (let ((learned-file-count 0)
          (progress (make <progress> 
                      :title task-name
                      :total (length files))))
      (for-each (lambda (file) 
                  (let* ((mail (make <mail> :file file))
                         (digest (mail-digest mail)))
                    (if (or (force-learn?) (not (skip? digest)))
                        (with-error-handler
                         (lambda (e)
                           (scmail-wformat "~a: ~a" file (ref e 'message)))
                         (lambda ()
                           (process-words mail table-type)
                           (add-to-digest-cache! digest)
                           (inc! learned-file-count)
                           (if (slow?) (sys-sleep 1))
                           (when (and (> (flush-interval) 0)
                                      (= (modulo learned-file-count 
                                                 (flush-interval)) 0))
                                 (newline) 
                                 (flush-both-cache))
                           (scmail-dformat "~a: ~a: ~a" 
                                           learned-file-count task-name file)
                           ))
                        (scmail-dformat "skip: ~a" file))
                    (progress-inc! progress)))
                files)
      (progress-finish! progress)
      learned-file-count))

  ;; FIXME: copied from progress.scm
  (define (time-difference->real time0 time1)
    (let1 time (time-difference time0 time1)
          (+ (time-second time)
             (/ (time-nanosecond time) 1000000000))))

  (define (.xx number)
    (/ (round (* number 100)) 100))

  (define (report start-time learned-file-count)
    (let1 elapsed (time-difference->real (current-time) start-time)
          (format #t "summary:   ~:d ~a mails ~a ~aed in ~:d sec. (~a mails/sec.)\n"
                      learned-file-count
                      (if (eq? table-type token-table-index-of-spam)
                          "spam"
                          "nonspam")
                      (if (<= learned-file-count 1) "is" "are")
                      task-name
                      (.xx elapsed)
                      (if (> elapsed 0)
                          (.xx (/ learned-file-count elapsed)) 
                          "NaN")
                      )))

  (define (cleanup)
    (if (file-exists? (temporary-table-file))
        (sys-unlink (temporary-table-file)))
    (if (file-exists? (temporary-digest-file))
        (sys-unlink (temporary-digest-file)))
    (sys-rmdir (lock-file)))
  
  (let1 mailbox (make-scmail-mailbox (ref (scmail-config) 'mailbox-type)
                                     (ref (scmail-config) 'mailbox))
        (scmail-config-make-directory)
        (for-each (lambda (folder) (check-folder-path mailbox folder)) folders)
        (lock)
        (let ((learned-file-count 0)
              (start-time (current-time)))
          (dynamic-wind
              (lambda () #t)
              (lambda ()
                (prepare-temporary-files)
                (open-digest-db (temporary-digest-file))
                (with-token-table
                 (temporary-table-file) :write
                 (lambda ()
                   (let ((target-files
                          (collect-target-files-from-folders mailbox folders)))
                     (set! learned-file-count (learn-files target-files))
                     (flush-both-cache)
                     )))
                (swap-files)
                (report start-time learned-file-count)
                )
              (lambda () 
                (cleanup)
                )))))

(define (learn table-type folders)
  (learn-common table-type folders 
                "learn" learned? token-table-collect-words
                add-to-digest-db!))

(define (unlearn table-type folders)
  (learn-common table-type folders 
                "unlearn" not-learned? token-table-discard-words
                delete-from-digest-db!))

(define (check-spamness-of-mail file)
  (unless (file-is-readable? file)
    (scmail-eformat "can't read ~a" file))
  (with-token-table
   (table-file) :read
   (lambda ()
     (receive (prob lang words) 
              (spamness-of-mail 
               (make <mail> :file file))
              (print file)
              (print #`"  ,prob")
              (for-each (lambda (w) 
                          (print #`"    ,(car w) : ,(cdr w)"))
                        words)
              prob)))
  0)

;; (test-spamness-of-files nonspam-files #f)
;; (test-spamness-of-files spam-files #t)
(define (test-spamness-of-files files expect-spam?)
  (let ((threshold 0.9)
        (count 0)
        (bad  '())
        (mailbox-type (ref (scmail-config) 'mailbox-type)))
    (for-each (lambda (file)
                (let1 mail (make-scmail-mail mailbox-type :file file)
                      (inc! count)
                      (receive (prob lang words) (spamness-of-mail mail)
                               (when (or (and expect-spam?
                                              (< prob threshold))
                                         (and (not expect-spam?)
                                              (>= prob threshold)))
                                     (push! bad (list file prob words))
                                     ))))
	      files)
    (print #`"Out of ,count messages")
    (print #`"  ,(length bad) messages are identified incorrectly:")
    (for-each (lambda (entry)
                (print 
                 #`"  ,(sys-basename (car entry)) (score=,(cadr entry)):")
                (for-each (lambda (w)
                            (print #`"    ,(car w) : ,(cdr w)"))
                          (caddr entry)))
              bad)
    (length bad)
    ))

(define (check-spamness-in-folder folder spam?)
  (let ((number-of-incorrect-answers 0)
        (mailbox (make-scmail-mailbox (ref (scmail-config) 'mailbox-type)
                                      (ref (scmail-config) 'mailbox))))
    (check-folder-path mailbox folder)
    (with-token-table
     (table-file) :read
     (lambda ()
       (set! number-of-incorrect-answers
             (test-spamness-of-files (collect-target-files mailbox folder)
                                     spam?))))
    number-of-incorrect-answers))

(define (table-stat)
  (with-token-table 
   (table-file) :read 
   (lambda ()
     (let ((mcount (token-table-message-count))
           (tcount (token-table-token-count))
           (totals (make-list (token-table-number-of-values) 0)))
       (format #t "lang       nonspam           spam\n")
       (dolist (lang (token-table-languages))
               (let1 v (list (ref tcount (token-table-index-of-nonspam lang))
                             (ref mcount (token-table-index-of-nonspam lang))
                             (ref tcount (token-table-index-of-spam lang))
                             (ref mcount (token-table-index-of-spam lang)))
                     (apply format #t "~4a: ~7dw/~5dm  ~7dw/~5dm\n" lang v)
                     (set! totals (map + totals v))))
       (apply format #t "total:~7dw/~5dm  ~7dw/~5dm\n" totals))))
  0)

(define (dump-table)
  (with-token-table 
   (table-file) :read
   (lambda ()
     (token-table-for-each 
      (lambda (key value)
        (unless (string-prefix? (token-table-special-key-prefix) key)
                (format #t "~a\t~a\n" key value))))))
  0)

(define (dump-digest)
  (open-digest-db (digest-file))
  (dbm-for-each (digest-db)
                (lambda (key value)
                  (format #t "~a\t~a\n" key value)))
  0)

;;(define (update-db old-db)
;;  (format #t #`"converting ,old-db to ,(table-file)...")
;;  (flush)
;;  (convert-database old-db (table-file))
;;  (print "done.")
;;  0)


(define (main args)
  (define folders '())

  (define (get-folders)
    (if (null? folders)
        (scmail-eformat "one or more folders should be specified"))
    folders)

  (define (learn-internal table)
    (learn table (get-folders))
    0)
  (define (unlearn-internal table)
    (unlearn table (get-folders))
    0)

  (scmail-set-program-name! (car args))
  (scmail-check-gauche-version)
  (let* ((config-file (scmail-config-default-file))
         (verbose-mode? #f)
         (quiet-mode? #f)
         (rest
          (parse-options
           (cdr args)
           (("learn-nonspam" ()
             (set! command (lambda ()
                             (learn-internal token-table-index-of-nonspam))))
            ("learn-spam" ()
             (set! command (lambda ()
                             (learn-internal token-table-index-of-spam))))
            ("unlearn-nonspam" ()
             (set! command (lambda ()
                             (unlearn-internal token-table-index-of-nonspam))))
            ("unlearn-spam" ()
             (set! command (lambda ()
                             (unlearn-internal token-table-index-of-spam))))
            ("check-mail=s" (file)
             (set! command (lambda () (check-spamness-of-mail file))))
            ("d|scmail-dir=s" (dir)
             (scmail-config-set-directory! dir))
            ("check-nonspam=s" (folder)
             (set! command (lambda () (check-spamness-in-folder folder #f))))
            ("check-spam=s" (folder)
             (set! command (lambda () (check-spamness-in-folder folder #t))))
            ("dump-table" ()
             (set! command (lambda () (dump-table))))
            ("dump-digest" ()
             (set! command (lambda () (dump-digest))))
            ("table-stat" ()
             (set! command (lambda () (table-stat))))
            ;;("update-db=s" (old-db)
            ;; (set! command (lambda () (update-db old-db))))
            ("c|config=s" (file)
             (set! config-file file))
            ("table=s" (table)
             (set! *table-file* (expand-path table)))
            ("digest=s" (file)
             (set! *digest-file* (expand-path file)))
            ("force" () (force-learn? #t))
            ("slow" () (slow? #t))
            ("flush-interval=i" (i) (flush-interval i))
            ("v|verbose" () (set! verbose-mode? #t))
            ("q|quiet" () (set! quiet-mode? #t))
            ("h|help" () (usage))
            (else _ (usage))))))

    (if verbose-mode? (scmail-config-set-verbose-mode!))
    (set! folders rest)
    (with-output-to-port (if quiet-mode? 
                             (open-output-file "/dev/null")
                             (standard-output-port))
      (lambda () 
        (scmail-config-read config-file)
        (command)))
    ))

;; Local variables:
;; mode: scheme
;; end:
