Let's start with the well-known Iris dataset used for benchmarking clustering and visualization approaches. It comprises a set of 150 samples from three species of Iris with four morphologic measurements: petal length, petal width, sepal length and sepal width.
A simple scatterplot of two of those measurements, coloured by species might look like this:
{-| Scale properties used as the `MScale` of the colour channel in the charts
below. Each of the three species names appearing in the `species` column is
paired with a fixed RGB colour string.
-}
irisColors : List ScaleProperty
irisColors =
    let
        speciesColours =
            [ ( "setosa", "rgb(125,200,125)" )
            , ( "versicolor", "rgb(190,175,210)" )
            , ( "virginica", "rgb(255,190,130)" )
            ]
    in
        categoricalDomainMap speciesColours
{-| Scatterplot with one circle per Iris sample: sepal length on the x-axis,
petal length on the y-axis and colour determined by species via `irisColors`.
-}
scatter : Spec
scatter =
    let
        -- Inline table holding only the three columns this chart reads.
        irisTable =
            dataFromColumns []
                << dataColumn "sepal length" (Numbers sLength)
                << dataColumn "petal length" (Numbers pLength)
                << dataColumn "species" (Strings species)

        xChannel =
            position X [ PName "sepal length", PmType Quantitative ]

        yChannel =
            position Y [ PName "petal length", PmType Quantitative ]

        colourChannel =
            color [ MName "species", MmType Nominal, MScale irisColors ]
    in
        toVegaLite
            [ irisTable []
            , mark Circle []
            , (encoding << xChannel << yChannel << colourChannel) []
            ]
Because we will be manipulating the data below, we will start by storing the raw Iris data inline in Elm:
{-| Raw sepal length measurements stored inline, one value per Iris sample
(150 values). The charts in this document combine this list column-wise with
`sWidth`, `pLength`, `pWidth` and `species`, so the lists are index-aligned:
position i in each list describes the same sample.
-}
sLength : List Float
sLength =
[ 5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5, 5, 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5, 5.5, 4.9, 4.4, 5.1, 5, 4.5, 4.4, 5, 5.1, 4.8, 5.1, 4.6, 5.3, 5, 7, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5, 5.9, 6, 6.1, 5.6, 6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7, 6, 5.7, 5.5, 5.5, 5.8, 6, 5.4, 6, 6.7, 6.3, 5.6, 5.5, 5.5, 6.1, 5.8, 5, 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3, 6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5, 7.7, 7.7, 6, 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2, 7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6, 6.9, 6.7, 6.9, 5.8, 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9 ]
{-| Raw sepal width measurements stored inline, one value per Iris sample
(150 values), index-aligned with `sLength`, `pLength`, `pWidth` and `species`.
-}
sWidth : List Float
sWidth =
[ 3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3, 3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3, 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.6, 3, 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2, 3, 2.2, 2.9, 2.9, 3.1, 3, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3, 2.8, 3, 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3, 3.4, 3.1, 2.3, 3, 2.5, 2.6, 3, 2.6, 2.3, 2.7, 3, 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3, 2.9, 3, 3, 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3, 2.5, 2.8, 3.2, 3, 3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3, 2.8, 3, 2.8, 3.8, 2.8, 2.8, 2.6, 3, 3.4, 3.1, 3, 3.1, 3.1, 3.1, 2.7, 3.2, 3.3, 3, 2.5, 3, 3.4, 3 ]
{-| Raw petal length measurements stored inline, one value per Iris sample
(150 values), index-aligned with `sLength`, `sWidth`, `pWidth` and `species`.
-}
pLength : List Float
pLength =
[ 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, 1.7, 1.5, 1, 1.7, 1.9, 1.6, 1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5, 1.4, 1.5, 1.2, 1.3, 1.4, 1.3, 1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4, 1.6, 1.4, 1.5, 1.4, 4.7, 4.5, 4.9, 4, 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.2, 4, 4.7, 3.6, 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4, 4.9, 4.7, 4.3, 4.4, 4.8, 5, 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.5, 4.7, 4.4, 4.1, 4, 4.4, 4.6, 4, 3.3, 4.2, 4.2, 4.2, 4.3, 3, 4.1, 6, 5.1, 5.9, 5.6, 5.8, 6.6, 4.5, 6.3, 5.8, 6.1, 5.1, 5.3, 5.5, 5, 5.1, 5.3, 5.5, 6.7, 6.9, 5, 5.7, 4.9, 6.7, 4.9, 5.7, 6, 4.8, 4.9, 5.6, 5.8, 6.1, 6.4, 5.6, 5.1, 5.6, 6.1, 5.6, 5.5, 4.8, 5.4, 5.6, 5.1, 5.1, 5.9, 5.7, 5.2, 5, 5.2, 5.4, 5.1 ]
{-| Raw petal width measurements stored inline, one value per Iris sample
(150 values), index-aligned with `sLength`, `sWidth`, `pLength` and `species`.
-}
pWidth : List Float
pWidth =
[ 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.4, 0.2, 0.5, 0.2, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2, 0.3, 0.3, 0.2, 0.6, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 1.4, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6, 1, 1.3, 1.4, 1, 1.5, 1, 1.4, 1.3, 1.4, 1.5, 1, 1.5, 1.1, 1.8, 1.3, 1.5, 1.2, 1.3, 1.4, 1.4, 1.7, 1.5, 1, 1.1, 1, 1.2, 1.6, 1.5, 1.6, 1.5, 1.3, 1.3, 1.3, 1.2, 1.4, 1.2, 1, 1.3, 1.2, 1.3, 1.3, 1.1, 1.3, 2.5, 1.9, 2.1, 1.8, 2.2, 2.1, 1.7, 1.8, 1.8, 2.5, 2, 1.9, 2.1, 2, 2.4, 2.3, 1.8, 2.2, 2.3, 1.5, 2.3, 2, 2, 1.8, 2.1, 1.8, 1.8, 1.8, 2.1, 1.6, 1.9, 2, 2.2, 1.5, 1.4, 2.3, 2.4, 1.8, 1.8, 2.1, 2.4, 2.3, 1.9, 2.3, 2.5, 2.3, 1.9, 2, 2.3, 1.8 ]
species : List String
species =
[ "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "versicolor", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica" ]
To compare all possible morphologic pairs as scatterplots we can create a scatterplot matrix (SPLOM):
{-| Scatterplot matrix: a 120x120 circle-mark scatterplot repeated over every
row/column pairing of the four morphology measurements, coloured by species
via `irisColors`.
-}
splom : Spec
splom =
    let
        -- The same field list drives both the rows and the columns of the grid.
        measurements =
            [ "sepal length", "sepal width", "petal length", "petal width" ]

        irisTable =
            dataFromColumns []
                << dataColumn "petal length" (Numbers pLength)
                << dataColumn "petal width" (Numbers pWidth)
                << dataColumn "sepal length" (Numbers sLength)
                << dataColumn "sepal width" (Numbers sWidth)
                << dataColumn "species" (Strings species)

        -- x and y take whichever field the repeat operator supplies for the cell.
        xChannel =
            position X [ PRepeat Column, PmType Quantitative ]

        yChannel =
            position Y [ PRepeat Row, PmType Quantitative ]

        colourChannel =
            color [ MName "species", MmType Nominal, MScale irisColors ]

        cell =
            asSpec
                [ width 120
                , height 120
                , irisTable []
                , mark Circle []
                , (encoding << xChannel << yChannel << colourChannel) []
                ]
    in
        toVegaLite
            [ repeat [ RowFields measurements, ColumnFields measurements ]
            , specification cell
            ]
The SPLOM ensures every permutation of pairs is displayed and even the scatterplot of a measurement against itself (diagonal) has value in showing the distribution of every measurement. It does though take up quite some space and there is redundancy both in the data being shown (the top-right is a mirror of the bottom-left) and the visualization decorations (axes, tick marks, titles etc.).
A parallel coordinates plot (PCP) can be thought of as a set of line graphs where the x-axis is an ordered sequence of the variables to show and the y-axis is the magnitude of each of those variables. It has the advantage of being able to show a large number of variables in a small space. To create a PCP we need to reshape our data from:
variable A | variable B | variable C |
---|---|---|
a1 | b1 | c1 |
a2 | b2 | c2 |
a3 | b3 | c3 |
a4 | b4 | c4 |
to
Group | Variable | Value |
---|---|---|
1 | a | a1 |
1 | b | b1 |
1 | c | c1 |
2 | a | a2 |
2 | b | b2 |
2 | c | c2 |
3 | a | a3 |
3 | b | b3 |
3 | c | c3 |
4 | a | a4 |
4 | b | b4 |
4 | c | c4 |
This kind of list reshaping is something well suited to functional programming, so it can be managed easily in the 'elm' bit of elm-vega:
{-| Reshape a list of equal-length variable columns into the "long" form
needed for a parallel coordinates plot.

The input holds one inner list per variable. The result is a triple of flat,
index-aligned lists with one entry per (sample, variable) pair, ordered sample
by sample:

  - group ids: the sample's position, starting at 0, repeated once per variable
  - variable ids: 1 to the number of variables, cycling for every sample
  - values: the corresponding entries taken from the input columns

-}
toPCData : List (List a) -> ( List Float, List Float, List a )
toPCData xss =
    let
        numVars =
            List.length xss

        -- One inner list per sample, holding that sample's value for each variable.
        samples =
            transpose xss

        groupIds =
            List.indexedMap (\i _ -> List.repeat numVars (toFloat i)) samples

        varIds =
            List.range 1 numVars
                |> List.map toFloat
                |> List.repeat (List.length samples)
    in
        ( List.concat groupIds, List.concat varIds, List.concat samples )
{-| Repeat every element of a list `n` times in place, keeping the original
order, e.g. `spread 2 [ "a", "b" ]` gives `[ "a", "a", "b", "b" ]`.
-}
spread : Int -> List a -> List a
spread n =
    List.concatMap (List.repeat n)
{-| Transposes a list of lists, swapping rows for columns.

If the rows have different lengths the result is truncated to the length of
the shortest row. An empty outer list transposes to an empty list.

-}
transpose : List (List a) -> List (List a)
transpose ll =
    case ll of
        [] ->
            -- Without this guard the length check below is `0 == 0` and the
            -- function would recurse on `[]` forever.
            []

        _ ->
            let
                heads =
                    List.filterMap List.head ll

                tails =
                    List.filterMap List.tail ll
            in
                -- Stop as soon as any row has run out of elements.
                if List.length heads == List.length ll then
                    heads :: transpose tails
                else
                    []
If we then encode the `variable` with x-position and `value` with y-position and break lines by `group` (with Vega-Lite's `detail` encoding), we get:
{-| First approximation of a parallel coordinates plot: the four measurement
lists are reshaped with `toPCData`, then drawn as one thin line per sample
(split with the detail channel on "groups"), with the variable number on the
x-axis, the raw measurement on the y-axis and colour by species.
-}
pc1 : Spec
pc1 =
    let
        measurements =
            [ sLength, sWidth, pLength, pWidth ]

        ( sampleIds, varIds, values ) =
            toPCData measurements

        -- Each sample yields one row per measurement, so its species label is
        -- repeated that many times to stay aligned with the reshaped columns.
        speciesPerRow =
            spread (List.length measurements) species

        longTable =
            dataFromColumns []
                << dataColumn "morphology" (Numbers values)
                << dataColumn "groups" (Numbers sampleIds)
                << dataColumn "vars" (Numbers varIds)
                << dataColumn "species" (Strings speciesPerRow)

        xChannel =
            position X [ PName "vars", PmType Ordinal ]

        yChannel =
            position Y [ PName "morphology", PmType Quantitative ]

        lineBreaks =
            detail [ DName "groups", DmType Nominal ]

        colourChannel =
            color [ MName "species", MmType Nominal, MScale irisColors ]
    in
        toVegaLite
            [ width 300
            , longTable []
            , mark Line [ MStrokeWidth 0.3 ]
            , (encoding << xChannel << yChannel << lineBreaks << colourChannel) []
            ]
While a pretty good approximation of a parallel coordinates plot, it does have several problems. Firstly, the variables are shown on the x-axis by their variable ordering number (1-4) by default rather than as labelled vertical axes. More significantly, all variables are scaled to the same range (0-8) and assumed to be of the same type (here they are all lengths but in many PCPs, they can be a range of different data types and measurement scales).
To overcome these problems we can use elm to rescale all variables to the range [0-1], hide all axes and then create a separate 'axis-annotation layer' for the four morphology variables:
{-| Linearly rescale a list of numbers so that its minimum maps to 0 and its
maximum maps to 1.

An empty list is returned unchanged. If every value is identical there is no
range to scale by, so all values map to 0 rather than dividing zero by zero
and producing `NaN`.

-}
normalize : List Float -> List Float
normalize xs =
    let
        -- The defaults are only reached for an empty list, where the map
        -- below produces an empty result anyway.
        minX =
            List.minimum xs |> Maybe.withDefault 0

        maxX =
            List.maximum xs |> Maybe.withDefault 1

        range =
            maxX - minX
    in
        if range == 0 then
            List.map (always 0) xs
        else
            List.map (\x -> (x - minX) / range) xs
{-| Parallel coordinates plot with every measurement rescaled independently by
`normalize`, so all four variables share the same vertical extent.

Two layers are drawn over the same reshaped table: thin per-sample lines
coloured by species, and a rule mark at each variable's x-position that acts
as a labelled vertical axis. The view border is removed and the y-axis of the
line layer is hidden.

-}
pc2 : Spec
pc2 =
    let
        measurements =
            List.map normalize [ sLength, sWidth, pLength, pWidth ]

        ( sampleIds, varIds, values ) =
            toPCData measurements

        -- Variable ids from `toPCData` follow the order of `measurements`.
        varLabel v =
            case v of
                1 ->
                    "sepal length"

                2 ->
                    "sepal width"

                3 ->
                    "petal length"

                4 ->
                    "petal width"

                _ ->
                    "unknown"

        longTable =
            dataFromColumns []
                << dataColumn "morphology" (Numbers values)
                << dataColumn "groups" (Numbers sampleIds)
                << dataColumn "vars" (Strings (List.map varLabel varIds))
                << dataColumn "species" (Strings (spread (List.length measurements) species))

        frameless =
            configure (configuration (View [ Stroke Nothing ]) [])

        xChannel =
            position X [ PName "vars", PmType Ordinal, PAxis [ AxDomain False, AxGrid False ] ]

        yChannel =
            position Y [ PName "morphology", PmType Quantitative, PAxis [] ]

        lineBreaks =
            detail [ DName "groups", DmType Nominal ]

        colourChannel =
            color [ MName "species", MmType Nominal, MScale irisColors ]

        linesLayer =
            asSpec
                [ width 300
                , mark Line [ MStrokeWidth 0.3 ]
                , (encoding << xChannel << yChannel << lineBreaks << colourChannel) []
                ]

        -- Only x is encoded here: one rule per variable.
        axesLayer =
            asSpec
                [ width 300
                , mark Rule []
                , encoding (position X [ PName "vars", PmType Ordinal, PAxis [ AxTitle "" ] ] [])
                ]
    in
        toVegaLite [ frameless, longTable [], layer [ linesLayer, axesLayer ] ]
Finally, the rather lengthy inline data generated via elm can be placed in its own file to keep the Vega-Lite specification clean and readable.
{-| Scale properties used as the `MScale` of the colour channel in `pcFinal`.
Each of the three Iris species names is paired with a fixed RGB colour string.
-}
irisColors : List ScaleProperty
irisColors =
    let
        speciesColours =
            [ ( "setosa", "rgb(125,200,125)" )
            , ( "versicolor", "rgb(190,175,210)" )
            , ( "virginica", "rgb(255,190,130)" )
            ]
    in
        categoricalDomainMap speciesColours
{-| Parallel coordinates plot that reads its already reshaped data from
"data/irispc.json" instead of building it inline.

Two layers share the "vars" x-position: thin per-sample lines (split with the
detail channel on "groups" and coloured by species) and a rule mark per
variable acting as a labelled vertical axis. The view border is removed and
the y-axis of the line layer is hidden.

NOTE(review): the data file is assumed to provide the fields "vars",
"morphology", "groups" and "species" — confirm against the file.

-}
pcFinal : Spec
pcFinal =
    let
        config =
            configure
                << configuration (View [ Stroke Nothing ])

        data =
            dataFromUrl "data/irispc.json"

        pcEnc =
            encoding
                << position X [ PName "vars", PmType Ordinal, PAxis [ AxDomain False, AxGrid False ] ]
                << position Y [ PName "morphology", PmType Quantitative, PAxis [] ]
                << detail [ DName "groups", DmType Nominal ]
                << color [ MName "species", MmType Nominal, MScale irisColors ]

        pcSpec =
            asSpec [ width 300, mark Line [ MStrokeWidth 0.3 ], pcEnc [] ]

        -- "vars" is typed Ordinal here to agree with the line layer, which
        -- shares this x-position.
        axisEnc =
            encoding
                << position X [ PName "vars", PmType Ordinal, PAxis [ AxTitle "" ] ]

        axisSpec =
            asSpec [ width 300, mark Rule [], axisEnc [] ]
    in
        toVegaLite [ config [], data [], layer [ pcSpec, axisSpec ] ]