clojure simple markov data transform

Question 1

Given you have words:

(def words ["john" "said" "lara" "chased" "john" "walked" "lara" "chased"])

Use this transformation-fn

(defn transform
  [words]
  (->> words
       (partition 2 1)
       (reduce (fn [acc [w next-w]]
                 ;; could be shortened to #(update-in %1 %2 (fnil inc 0))
                 (update-in acc
                            [w next-w]
                            (fnil inc 0))) 
               {})))

(transform words)
;; {"walked" {"lara" 1}, "chased" {"john" 1}, "lara" {"chased" 2}, "said" {"lara" 1}, "john" {"walked" 1, "said" 1}}

EDIT: You can gain performance using transient hash-maps like this:

(defn transform-fast
  [words]
  (->> (map vector words (next words))
       (reduce (fn [acc [w1 w2]]
                 (let [c-map (get acc w1 (transient {}))]
                   (assoc! acc w1 (assoc! c-map w2
                                          (inc (get c-map w2 0))))))
               (transient {}))
       persistent!
       (reduce-kv (fn [acc w1 c-map]
                    (assoc! acc w1 (persistent! c-map)))
                  (transient {}))
       persistent!))

Obviously the resulting source code doesn't look as nice and such optimization should only happen if it is critical.

(Criterium says it beats Michał Marczyks transform* being roughly two times as fast on King Lear).

Question 2

(Update: see below for a solution using java.util.HashMap for the intermediate products -- end result still being fully persistent -- which is the fastest yet, with a factor of 2.35 advantage over transform-fast in the King Lear benchmark.)

`merge-with`-based solution

Here's a faster solution, by a factor of roughly 1.7 on words taken from King Lear (see below for the exact methodology), almost 3x on the sample words:

(defn transform* [words]
  (apply merge-with
         #(merge-with + %1 %2)
         (map (fn [w nw] {w {nw 1}})
              words
              (next words))))

The function passed to map could alternatively be written

#(array-map %1 (array-map %2 1)),

although the timings with this approach aren't quite as good. (I still include this version in the benchmark below as transform**.)

First, a sanity check:

;; same input
(def words ["john" "said" "lara" "chased" "john"
            "walked" "lara" "chased"])

(= (transform words) (transform* words) (transform** words))
;= true

Criterium benchmark using the test input (OpenJDK 1.7 with -XX:+UseConcMarkSweepGC):

(do (c/bench (transform words))
    (flush)
    (c/bench (transform* words))
    (flush)
    (c/bench (transform** words)))
Evaluation count : 4345080 in 60 samples of 72418 calls.
             Execution time mean : 13.945669 µs
    Execution time std-deviation : 158.808075 ns
   Execution time lower quantile : 13.696874 µs ( 2.5%)
   Execution time upper quantile : 14.295440 µs (97.5%)
                   Overhead used : 1.612143 ns

Found 2 outliers in 60 samples (3.3333 %)
    low-severe   2 (3.3333 %)
 Variance from outliers : 1.6389 % Variance is slightly inflated by outliers
Evaluation count : 12998220 in 60 samples of 216637 calls.
             Execution time mean : 4.705608 µs
    Execution time std-deviation : 63.133406 ns
   Execution time lower quantile : 4.605234 µs ( 2.5%)
   Execution time upper quantile : 4.830540 µs (97.5%)
                   Overhead used : 1.612143 ns

Found 1 outliers in 60 samples (1.6667 %)
    low-severe   1 (1.6667 %)
 Variance from outliers : 1.6389 % Variance is slightly inflated by outliers
Evaluation count : 10847220 in 60 samples of 180787 calls.
             Execution time mean : 5.706852 µs
    Execution time std-deviation : 73.589941 ns
   Execution time lower quantile : 5.560404 µs ( 2.5%)
   Execution time upper quantile : 5.828209 µs (97.5%)
                   Overhead used : 1.612143 ns

Finally, the more interesting benchmark using King Lear as found on Project Gutenberg (not bothering to strip out the legal notices and such before processing):

(def king-lear (slurp (io/file "/path/to/pg1128.txt")))

(def king-lear-words
  (-> king-lear
      (string/lower-case)
      (string/replace #"[^a-z]" " ")
      (string/trim)
      (string/split #"\s+")))

(do (c/bench (transform king-lear-words))
    (flush)
    (c/bench (transform* king-lear-words))
    (flush)
    (c/bench (transform** king-lear-words)))
Evaluation count : 720 in 60 samples of 12 calls.
             Execution time mean : 87.012898 ms
    Execution time std-deviation : 833.381589 µs
   Execution time lower quantile : 85.772832 ms ( 2.5%)
   Execution time upper quantile : 88.585741 ms (97.5%)
                   Overhead used : 1.612143 ns
Evaluation count : 1200 in 60 samples of 20 calls.
             Execution time mean : 51.786860 ms
    Execution time std-deviation : 587.029829 µs
   Execution time lower quantile : 50.854355 ms ( 2.5%)
   Execution time upper quantile : 52.940274 ms (97.5%)
                   Overhead used : 1.612143 ns
Evaluation count : 1020 in 60 samples of 17 calls.
             Execution time mean : 61.287369 ms
    Execution time std-deviation : 720.816107 µs
   Execution time lower quantile : 60.131219 ms ( 2.5%)
   Execution time upper quantile : 62.960647 ms (97.5%)
                   Overhead used : 1.612143 ns

`java.util.HashMap`-based solution

Going all out, it's possible to do even better using mutable hash maps for the intermediate state and loop / recur to avoid consing while looping over pairs of words:

(defn t9 [words]
  (let [m (java.util.HashMap.)]
    (loop [ws  words
           nws (next words)]
      (if nws
        (let [w  (first ws)
              nw (first nws)]
          (if-let [im ^java.util.HashMap (.get m w)]
            (.put im nw (inc (or (.get im nw) 0)))
            (let [im (java.util.HashMap.)]
              (.put im nw 1)
              (.put m w im)))
          (recur (next ws) (next nws)))
      (persistent!
       (reduce (fn [out k]
                 (assoc! out k
                         (clojure.lang.PersistentHashMap/create
                          ^java.util.HashMap (.get m k))))
               (transient {})
               (iterator-seq (.iterator (.keySet m)))))))))

clojure.lang.PersistentHashMap/create is a static method in the PHM class and is admittedly an implementation detail. (Not likely to change in the near future, though -- currently all map creation in Clojure for the built-in map types goes through static methods like this.)

Sanity check:

(= (transform king-lear-words) (t9 king-lear-words))
;= true

Benchmark results:

(c/bench (transform-fast king-lear-words))
Evaluation count : 2100 in 60 samples of 35 calls.
             Execution time mean : 28.560527 ms
    Execution time std-deviation : 262.483916 µs
   Execution time lower quantile : 28.117982 ms ( 2.5%)
   Execution time upper quantile : 29.104784 ms (97.5%)
                   Overhead used : 1.898836 ns

(c/bench (t9 king-lear-words))
Evaluation count : 4980 in 60 samples of 83 calls.
             Execution time mean : 12.153898 ms
    Execution time std-deviation : 119.028100 µs
   Execution time lower quantile : 11.953013 ms ( 2.5%)
   Execution time upper quantile : 12.411588 ms (97.5%)
                   Overhead used : 1.898836 ns

Found 1 outliers in 60 samples (1.6667 %)
    low-severe   1 (1.6667 %)
 Variance from outliers : 1.6389 % Variance is slightly inflated by outliers

clojure simple markov data transform

merge-with-based solution

java.util.HashMap-based solution

`merge-with`-based solution

`java.util.HashMap`-based solution