diff --git a/docs/src/reference/asciidoc/appendix.adoc b/docs/src/reference/asciidoc/appendix.adoc index a85f6fb2..208e0a7e 100644 --- a/docs/src/reference/asciidoc/appendix.adoc +++ b/docs/src/reference/asciidoc/appendix.adoc @@ -563,3 +563,19 @@ What's happening in above chart: * Jepsen will heal network and after some time nodes `n3/n4` will join back into ensemble and synchronize its distributed status. +=== Crash and Join Tolerance +In this test we will demonstrate that killing an existing state machine +and then joining a new instance back into an ensemble will keep the +distributed state healthy and newly joined state machines will synchronize +their states properly. + +image::images/sm-tech-stop-start.png[width=500] + +What's happening in the above chart: + +* All state machines are transitioned from initial state `S21` into + `S211` so that we can test proper state synchronization during join. +* `X` marks when a specific node has been crashed and started. +* At the same time we request states from all machines and plot them. +* Finally we do a simple transition back to `S21` from `S211` to make + sure that all state machines are still functioning properly. 
diff --git a/docs/src/reference/asciidoc/images/sm-tech-stop-start.png b/docs/src/reference/asciidoc/images/sm-tech-stop-start.png new file mode 100644 index 00000000..6538af77 Binary files /dev/null and b/docs/src/reference/asciidoc/images/sm-tech-stop-start.png differ diff --git a/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/checker.clj b/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/checker.clj index 85e89124..d76965c8 100644 --- a/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/checker.clj +++ b/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/checker.clj @@ -12,6 +12,7 @@ [knossos.history :as history] [gnuplot.core :as g])) +(def nodetovalue {"n1" 1 "n2" 2 "n3" 3 "n4" 4 "n5" 5 }) (def statetovalue {"S11" 1 "S12" 2 "S211" 3 "S212" 4 }) (def variabletovalue {"v1" 1 "v2" 2 "v3" 3 "v4" 4 "v5" 5 "v6" 6 "v7" 7 "v8" 8}) @@ -65,6 +66,16 @@ (vector (reduce-kv (fn [vec key value] (conj vec (vector (get value :time) (+ (get value :process) 1) (get value :v))) ) [] (vec data))))) +(defn extract-plot-data5 + [history] + (let [data + (->> history + (filter #(:value %)) + (filter #(= :nemesis (:process %))) + (filter #(= :stop (:f %))))] + (vector + (reduce-kv (fn [vec key value] (conj vec (vector (get value :time) (get nodetovalue (name (last (keys (get value :value))))) "X")) ) [] (vec data))))) + (defn plot1! [test model history] @@ -129,6 +140,38 @@ output-path) {:valid? true}) +(defn plot3! + [test model history] + + (let [output-path (.getCanonicalPath (store/path! test "states.png"))] + (g/raw-plot! 
[[:set :key :outside] + [:set :style :textbox :opaque] + [:set :terminal :qt :size (keyword "900,450")] + [:set :yrange (keyword "[0.5:4.5]")] + [:set :y2range (keyword "[0.5:5.5]")] + [:set :xtics :format "%h\nns"] + [:set :xlabel "elapsed time"] + [:set :ylabel "states in nodes"] + [:set :y2label "crash/start in nodes"] + [:set :ytics 1] + [:set :ytics (keyword "('S21' 1, 'S22' 2, 'S211' 3, 'S212' 4)")] + [:set :ytics :nomirror] + [:set :y2tics 1] + [:set :y2tics (keyword "('n1' 1, 'n2' 2, 'n3' 3, 'n4' 4, 'n5' 5)")] + [:plot + (g/list ["-" :title "states n1" :with :steps :lw :3] + ["-" :title "states n2" :with :steps :lw :3] + ["-" :title "states n3" :with :steps :lw :3] + ["-" :title "states n4" :with :steps :lw :3] + ["-" :title "states n5" :with :steps :lw :3] + ["-" :title "crash" :with :labels :center :boxed :font ",15" :axis :x1y2] + )]] + (into + (extract-plot-data history) + (extract-plot-data5 history))) + output-path) + {:valid? true}) + (defn checker1 "Constructs a Jepsen checker." [] @@ -142,3 +185,10 @@ (reify Checker (check [_ test model history] (if (env :plot) (plot2! test model history) {:valid? true})))) + +(defn checker3 + "Constructs a Jepsen checker." + [] + (reify Checker + (check [_ test model history] + (if (env :plot) (plot3! test model history) {:valid? 
true})))) diff --git a/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/core.clj b/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/core.clj index e8ebf1e5..22a651f5 100644 --- a/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/core.clj +++ b/jepsen/spring-statemachine-jepsen/src/spring_statemachine_jepsen/core.clj @@ -17,6 +17,7 @@ [tests :as tests]] [spring-statemachine-jepsen.checker :refer [checker1]] [spring-statemachine-jepsen.checker :refer [checker2]] + [spring-statemachine-jepsen.checker :refer [checker3]] [jepsen.checker.timeline :as timeline] [jepsen.control.net :as net] [jepsen.os.debian :as debian] @@ -300,8 +301,7 @@ (gen-send-event-variable "J" "v7") (gen-read-variable "v7") (gen-send-event-variable "J" "v8") - (gen-read-variable "v8") -)) + (gen-read-variable "v8"))) (defn event-gen-4 "Generates event and checks states while splitting network" @@ -326,6 +326,64 @@ (gen-status 30) (gen-read-states 10 ["S0","S1","S11"]))) +(defn event-gen-5 + "Generates starts and stops and checks joins" + [] + (gen/phases + (gen-read-states 5 ["S0","S1","S11"]) + (gen-send-event-all "C") + (gen-read-states 5 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f :stop}])) + (gen-read-states 2 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f :stop}])) + (gen-read-states 2 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f :stop}])) + (gen-read-states 2 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f :stop}])) + (gen-read-states 2 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f :stop}])) + (gen-read-states 2 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f 
:stop}])) + (gen-read-states 2 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f :stop}])) + (gen-read-states 2 ["S0","S2","S21","S211"]) + (gen/nemesis + (gen/seq [{:type :info :f :start} + (gen/sleep 5) + {:type :info :f :stop}])) + (gen-read-states 5 ["S0","S2","S21","S211"]) + (gen-send-event-all "K") + (gen-read-states 5 ["S0","S1","S11"]))) + +(defn killer + "Kills statemachine on a random node on start, restarts it on stop." + [] + (nemesis/node-start-stopper + rand-nth + (fn start [test node] (c/su (c/exec :pkill :-9 :-f :spring-statemachine-samples-web))) + (fn stop [test node] (start! node)))) + (defn statemachine-test "Defaults for testing state machine." [name opts] @@ -374,3 +432,11 @@ {:nemesis (nemesis/partition-random-halves) :generator (event-gen-4) :checker (checker1)})) + +(defn stop-start-test + "Stops and starts nodes, checking that join is ok." + [] + (event-test "stop-start" + {:nemesis (killer) + :generator (event-gen-5) + :checker (checker3)})) diff --git a/jepsen/spring-statemachine-jepsen/test/spring_statemachine_jepsen/core_test.clj b/jepsen/spring-statemachine-jepsen/test/spring_statemachine_jepsen/core_test.clj index 54f122df..9d80efa6 100644 --- a/jepsen/spring-statemachine-jepsen/test/spring_statemachine_jepsen/core_test.clj +++ b/jepsen/spring-statemachine-jepsen/test/spring_statemachine_jepsen/core_test.clj @@ -25,3 +25,6 @@ (deftest partition-half (run-statemachine-test! (partition-half-test))) + +(deftest stop-start + (run-statemachine-test! (stop-start-test)))