Introduction

This file contains some examples of the functions related to the DDC routine. More specifically, DDC and cellMap will be illustrated.

library("cellWise")
library("gridExtra") # has grid.arrange()

# Default options for DDC:
DDCpars = list(fracNA = 0.5, numDiscrete = 3, precScale = 1e-12,
               cleanNAfirst = "automatic", tolProb = 0.99, 
               corrlim = 0.5, combinRule = "wmean",
               returnBigXimp = FALSE, silent = FALSE,
               nLocScale = 25000, fastDDC = FALSE,
               standType = "1stepM", corrType = "gkwls",
               transFun = "wrap", nbngbrs = 100)

# A small list giving the same results:
DDCpars = list(fastDDC = FALSE)

Example with row and column selection

First we illustrate the selection of columns and rows by DDC. This is actually done by the function checkDataSet which is called by DDC.

i = c(1,2,3,4,5,6,7,8,9) 
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii") 
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE) 
V1 = c(1.3,NaN,4.5,2.7,20.0,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,-10,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5) 
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s) 
datafr
##   i name logic   V1    V2  V3  Vna Vdis V0s
## 1 1   aa  TRUE  1.3   2.3   2    1    1 1.0
## 2 2   bb FALSE  NaN    NA Inf   -4    1 1.5
## 3 3   cc  TRUE  4.5   5.0   3    2    2 2.0
## 4 4   dd FALSE  2.7   6.0  -4  NaN    2 2.0
## 5 5   ee FALSE 20.0   7.0   5    3    3 2.0
## 6 6   ff  TRUE  4.4   8.0   6 -Inf    3 2.0
## 7 7   gg  TRUE -2.1   4.0   7   NA    3 2.0
## 8 8   hh  TRUE  1.1 -10.0  -2    6    1 3.0
## 9 9   ii FALSE -5.0   0.5   8    5    2 2.5
DDCdatafr = DDC(datafr,DDCpars)
##  
##  The input data has 9 rows and 9 columns.
##  
##  The input data contained 2 non-numeric columns (variables).
##  Their column names are:
##  
## [1] name  logic
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 7 numeric columns:
##  
## [1] i    V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 columns that were identical to the case number
##  (number of the row).
##  Their column names are:
##  
## [1] i
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 6 columns:
##  
## [1] V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 discrete columns with 3 or fewer values.
##  Their column names are:
##  
## [1] Vdis
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 5 columns:
##  
## [1] V1  V2  V3  Vna V0s
##  
##  The data contained 1 columns with zero or tiny median absolute deviation.
##  Their column names are:
##  
## [1] V0s
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 4 columns:
##  
## [1] V1  V2  V3  Vna
##  
##  The final data set we will analyze has 9 rows and 4 columns.
## 
remX = DDCdatafr$remX; dim(remX)
## [1] 9 4
cellMap(D=remX, R=DDCdatafr$stdResid, rowlabels = 1:nrow(remX), 
        columnlabels = colnames(remX))

plot of chunk unnamed-chunk-3

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.

Small generated dataset

set.seed(12345) # for reproducibility
n <- 50; d <- 20
A <- matrix(0.9, d, d); diag(A) = 1
round(A[1:10,1:10],1) # true covariance matrix
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [2,]  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [3,]  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [4,]  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9   0.9
##  [5,]  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9   0.9
##  [6,]  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9   0.9
##  [7,]  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9   0.9
##  [8,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9   0.9
##  [9,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0   0.9
## [10,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   1.0
library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 50, FALSE)] <- 10
x[sample(1:(n * d), 50, FALSE)] <- -10

# When not specifying DDCpars in the call to DDC
# all defaults are used:
DDCx <- DDC(x)
##  
##  The input data has 50 rows and 20 columns.
cellMap(D=DDCx$remX, R=DDCx$stdResid, columnlabels = 1:d, 
        rowlabels = 1:n)

plot of chunk unnamed-chunk-4

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.
DDCx$DDCpars # These are the default options:
## $fracNA
## [1] 0.5
## 
## $numDiscrete
## [1] 3
## 
## $precScale
## [1] 1e-12
## 
## $cleanNAfirst
## [1] "automatic"
## 
## $tolProb
## [1] 0.99
## 
## $corrlim
## [1] 0.5
## 
## $combinRule
## [1] "wmean"
## 
## $returnBigXimp
## [1] FALSE
## 
## $silent
## [1] FALSE
## 
## $nLocScale
## [1] 25000
## 
## $fastDDC
## [1] FALSE
## 
## $standType
## [1] "1stepM"
## 
## $corrType
## [1] "gkwls"
## 
## $transFun
## [1] "wrap"
## 
## $nbngbrs
## [1] 100
names(DDCx)
##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"  
##  [4] "namesNotNumeric" "namesCaseNumber" "namesNAcol"     
##  [7] "namesNArow"      "namesDiscrete"   "namesZeroScale" 
## [10] "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"          
## [16] "robcors"         "robslopes"       "deshrinkage"    
## [19] "Xest"            "scalestres"      "stdResid"       
## [22] "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"         
## [28] "indNAs"          "Ximp"
# We will now go through these outputs one by one:

DDCx$colInAnalysis # all columns X1,...,X20 remain:
##  X1  X2  X3  X4  X5  X6  X7  X8  X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## X19 X20 
##  19  20
DDCx$rowInAnalysis # all rows 1,...,50 remain:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
DDCx$namesNotNumeric # no non-numeric columns:
## NULL
DDCx$namesCaseNumber # no column was the case number:
## NULL
DDCx$namesNAcol # no columns with too many NA's:
## NULL
DDCx$namesNArow # no columns with too many NA's:
## NULL
DDCx$namesDiscrete # no discrete columns:
## NULL
DDCx$namesZeroScale # no columns with scale = 0:
## NULL
dim(DDCx$remX) # remaining data matrix
## [1] 50 20
round(DDCx$locX,2) # robust location estimates of all 20 columns:
##    X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12 
## -0.15 -0.23 -0.18 -0.31 -0.17 -0.25 -0.63 -0.33 -0.40 -0.23 -0.31 -0.32 
##   X13   X14   X15   X16   X17   X18   X19   X20 
## -0.20 -0.28 -0.35 -0.23 -0.26 -0.23 -0.33 -0.37
round(DDCx$scaleX,2) # robust scale estimates of all 20 columns:
##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15 
## 1.33 1.54 1.47 1.21 1.33 1.29 1.21 1.25 1.21 1.38 1.43 1.26 1.06 1.38 1.24 
##  X16  X17  X18  X19  X20 
## 1.27 1.50 1.20 1.12 1.31
round(DDCx$Z[1:10,1:10],1)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,] -0.5 -0.6 -0.3 -0.2  0.0 -0.1 -0.2 -0.6  0.2   0.2
##  [2,]  0.0 -0.1 -0.5   NA -0.7 -0.1 -0.2 -7.7 -7.9  -0.2
##  [3,]  0.2  0.1  0.3  0.5  0.2  7.9  0.4   NA  0.6   7.4
##  [4,]  0.5  0.4  0.1  0.2  0.6 -7.5  0.7  0.6  0.7   7.4
##  [5,] -0.6 -0.2 -0.2 -0.1 -0.2 -0.2  0.6 -0.5 -7.9  -0.6
##  [6,]  1.6  1.3  1.2  1.7  1.2  1.6  2.3  1.3  1.6   1.7
##  [7,] -0.2 -0.3 -6.7 -0.5 -0.7 -0.3 -0.1 -0.5 -7.9   0.0
##  [8,] -7.4  0.2  0.7  0.3  0.6  0.5   NA -7.7  0.5   0.1
##  [9,]   NA  0.1  0.2  0.7  0.5  0.6  0.4  0.2  0.0   0.4
## [10,]  0.3  0.6  0.7  0.6  1.1  0.6  1.4 -7.7  1.7   7.4
# Robustly standardized dataset. Due to the high correlations,
# cells in the same row look similar (except for outlying cells).

DDCx$nbngbrs
## [1] 19
# For each column the code looked for up to 19 non-self neighbors (highly correlated columns).
# It goes through all of them, unless fastDDC is set to TRUE.

DDCx$ngbrs[1:3,]
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,]    1   11    2   14   12   13   17   16   18    10     4     8    19
## [2,]    2   11   17   16   13   15   12    1    8     4    10    20    14
## [3,]    3   13   20    5   14   16    6   11    9    17     4     2    12
##      [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]    15    20     6     3     9     5     7
## [2,]     9     3     5    19     6    18     7
## [3,]    19    10     8    15     1    18     7
# Shows the neighbors, e.g. the nearest non-self neighbor of X1 is X11, then X2,...

round(DDCx$robcors[1:3,],2)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,]    1 0.93 0.93 0.93 0.93 0.92 0.91 0.91 0.90  0.90  0.90  0.90  0.90
## [2,]    1 0.94 0.94 0.93 0.93 0.93 0.93 0.93 0.93  0.92  0.92  0.91  0.91
## [3,]    1 0.95 0.94 0.94 0.93 0.93 0.92 0.91 0.91  0.91  0.91  0.91  0.90
##      [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.89  0.88  0.88  0.87  0.87  0.87  0.84
## [2,]  0.91  0.91  0.90  0.90  0.90  0.89  0.88
## [3,]  0.89  0.89  0.89  0.89  0.87  0.87  0.86
# Robust correlations with these neighbors. In each row the correlations
# are sorted by decreasing absolute value. 

round(DDCx$robslopes[1:3,],2)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,]    1 0.91 1.02 0.96 0.85 0.75 0.97 0.83 0.78  0.92  0.83  0.86  0.85
## [2,]    1 0.87 0.87 0.76 0.72 0.73 0.76 0.84 0.78  0.70  0.85  0.84  0.83
## [3,]    1 0.70 0.80 0.81 0.82 0.72 0.78 0.84 0.73  0.82  0.69  0.87  0.73
##      [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.81  0.87  0.83  0.97  0.85  0.80  0.70
## [2,]  0.73  0.94  0.76  0.74  0.72  0.71  0.67
## [3,]  0.69  0.79  0.72  0.69  0.79  0.68  0.65
# For each column, the slope of each neighbor predicting it.
# For instance, X1 is predicted by its first neighbor with 
# slope 0.91 and by its second neighbor with slope 1.02 .

round(DDCx$deshrinkage,2)
##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15 
## 1.09 1.09 1.09 1.07 1.08 1.09 1.14 1.07 1.08 1.07 1.09 1.09 1.10 1.08 1.09 
##  X16  X17  X18  X19  X20 
## 1.11 1.09 1.09 1.12 1.10
# For each column, the factor by which its prediction is multiplied.
round(DDCx$Xest[1:12,1:10],2) # the estimated cells of remX:
##        [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
##  [1,] -0.45 -0.55 -0.47 -0.61 -0.47 -0.56 -0.95 -0.62 -0.68 -0.53
##  [2,] -0.51 -0.63 -0.56 -0.70 -0.57 -0.64 -1.04 -0.70 -0.77 -0.61
##  [3,]  0.24  0.19  0.20  0.09  0.24  0.17 -0.19  0.06 -0.01  0.17
##  [4,]  0.53  0.48  0.46  0.38  0.53  0.46  0.10  0.34  0.26  0.45
##  [5,] -0.46 -0.54 -0.47 -0.61 -0.48 -0.54 -0.91 -0.62 -0.68 -0.54
##  [6,]  1.81  1.85  1.72  1.71  1.86  1.84  1.54  1.61  1.50  1.81
##  [7,] -0.46 -0.56 -0.49 -0.63 -0.50 -0.58 -0.95 -0.65 -0.70 -0.55
##  [8,]  0.35  0.30  0.32  0.21  0.37  0.30 -0.09  0.17  0.09  0.28
##  [9,]  0.36  0.30  0.30  0.21  0.36  0.29 -0.08  0.16  0.08  0.29
## [10,]  0.93  0.94  0.90  0.82  0.99  0.93  0.60  0.77  0.70  0.90
## [11,]  0.23  0.17  0.16  0.07  0.21  0.14 -0.23  0.04 -0.04  0.15
## [12,] -1.54 -1.71 -1.56 -1.75 -1.64 -1.73 -2.15 -1.70 -1.74 -1.67
round(DDCx$stdResid[1:12,1:10],1)
##        [,1] [,2]  [,3] [,4] [,5]  [,6] [,7]  [,8]  [,9] [,10]
##  [1,]  -0.8 -1.7  -0.3  0.2  0.6   0.6  0.2  -1.1   1.6   1.2
##  [2,]   0.8  0.6  -0.7   NA -1.3   0.7  0.4 -24.4 -28.7   0.2
##  [3,]  -0.3 -0.5   0.2  0.6 -0.3  26.7  0.2    NA   1.2  21.5
##  [4,]  -0.1 -0.3  -1.1 -1.3  0.4 -28.4  0.2   0.3   0.7  20.9
##  [5,]  -1.2  0.1  -0.2  0.5  0.1   0.1  2.6  -0.7 -29.0  -1.1
##  [6,]   0.3 -0.4  -0.3  0.0 -1.0   0.0  1.5  -0.8   0.1   0.6
##  [7,]   0.2 -0.3 -21.0 -0.9 -1.6  -0.2  0.5  -0.8 -28.9   0.7
##  [8,] -23.9 -0.4   1.0 -0.4  0.6   0.3   NA -26.7   0.5  -0.9
##  [9,]    NA -0.8  -0.4  1.1  0.2   0.7 -0.1  -0.7  -1.4   0.0
## [10,]  -1.5 -0.4  -0.1 -1.3  0.9  -1.0  1.1 -28.3   2.9  19.9
## [11,]   0.6  0.4  -1.2  0.0 -0.6 -27.5  0.6   1.1  -0.4  21.6
## [12,]  -0.1 -0.3  -1.2 -0.1 -1.2  -1.0  1.0   2.1   1.1   0.3
# The standardized residuals of the cells. Note the NA's and some
# large positive and negative cell residuals.

qqnorm(as.vector(DDCx$stdResid)) # Note the far outliers on both sides:

plot of chunk unnamed-chunk-6

as.vector(DDCx$indcells) # indices of the cells that were flagged by DDC:
##   [1]    8   25   28   32   47   64   74   75   80   85   89   90   91  107
##  [15]  113  116  121  129  137  141  177  179  184  187  191  197  215  228
##  [29]  229  233  238  253  254  261  269  282  290  293  300  305  316  324
##  [43]  352  358  360  377  384  402  405  407  410  453  454  460  461  478
##  [57]  501  504  505  513  575  578  583  597  604  629  639  655  660  676
##  [71]  677  697  740  750  757  775  776  781  789  791  800  801  813  814
##  [85]  827  838  846  855  856  902  910  938  954  972  975  976  985  989
##  [99]  994 1000
plot(DDCx$Ti) # the Ti values of the rows. None are high.

plot of chunk unnamed-chunk-6

qqnorm(DDCx$Ti) # no outliers

plot of chunk unnamed-chunk-6

DDCx$medTi # median of the raw Ti (used to standardize Ti):
## [1] -0.000131909
DDCx$madTi # median absolute deviation of the raw Ti (used to standardize Ti):
## [1] 0.08437571
DDCx$indrows # numeric(0) means no rows are flagged:
## numeric(0)
as.vector(DDCx$indall) # indices of the flagged cells, including those in flagged rows:
##   [1]    8   25   28   32   47   64   74   75   80   85   89   90   91  107
##  [15]  113  116  121  129  137  141  177  179  184  187  191  197  215  228
##  [29]  229  233  238  253  254  261  269  282  290  293  300  305  316  324
##  [43]  352  358  360  377  384  402  405  407  410  453  454  460  461  478
##  [57]  501  504  505  513  575  578  583  597  604  629  639  655  660  676
##  [71]  677  697  740  750  757  775  776  781  789  791  800  801  813  814
##  [85]  827  838  846  855  856  902  910  938  954  972  975  976  985  989
##  [99]  994 1000
as.vector(DDCx$indNAs) # indices of the missing cells:
##  [1]   9  29  39  40  41  83  99 148 150 152 165 196 217 296 308 349 353
## [18] 369 376 383 444 466 471 531 587 589 609 622 678 680 684 737 743 749
## [35] 793 805 821 866 880 890 905 924 958 963 980
round(DDCx$Ximp[1:10,1:10],2)
##        [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
##  [1,] -0.81 -1.21 -0.59 -0.56 -0.23 -0.33 -0.86 -1.05 -0.15  0.01
##  [2,] -0.15 -0.38 -0.87 -0.70 -1.07 -0.37 -0.89 -0.70 -0.77 -0.53
##  [3,]  0.12 -0.01  0.31  0.28  0.13  0.17 -0.12  0.06  0.37  0.17
##  [4,]  0.48  0.36 -0.02 -0.08  0.68  0.46  0.18  0.45  0.50  0.45
##  [5,] -0.97 -0.50 -0.55 -0.45 -0.45 -0.51 -0.91 -0.90 -0.68 -1.05
##  [6,]  1.94  1.71  1.59  1.70  1.47  1.85  2.15  1.32  1.54  2.08
##  [7,] -0.39 -0.67 -0.49 -0.94 -1.11 -0.64 -0.77 -0.94 -0.70 -0.25
##  [8,]  0.35  0.15  0.78  0.08  0.60  0.39 -0.09  0.17  0.25 -0.13
##  [9,]  0.36 -0.02  0.10  0.57  0.44  0.55 -0.14 -0.09 -0.36  0.26
## [10,]  0.28  0.77  0.87  0.38  1.32  0.55  1.06  0.77  0.70  0.90
# The imputed matrix. Both the cellwise outliers and the missing values
# are replaced by their predicted values.

round((DDCx$Ximp - DDCx$remX)[1:10,1:10],2)
##       X1 X2   X3 X4 X5    X6    X7    X8    X9   X10
## 1   0.00  0 0.00  0  0  0.00  0.00  0.00  0.00  0.00
## 2   0.00  0 0.00 NA  0  0.00  0.00  9.30  9.23  0.00
## 3   0.00  0 0.00  0  0 -9.83  0.00    NA  0.00 -9.83
## 4   0.00  0 0.00  0  0 10.46  0.00  0.00  0.00 -9.55
## 5   0.00  0 0.00  0  0  0.00 -1.05  0.00  9.32  0.00
## 6   0.00  0 0.00  0  0  0.00  0.00  0.00  0.00  0.00
## 7   0.00  0 9.51  0  0  0.00  0.00  0.00  9.30  0.00
## 8  10.35  0 0.00  0  0  0.00    NA 10.17  0.00  0.00
## 9     NA  0 0.00  0  0  0.00  0.00  0.00  0.00  0.00
## 10  0.00  0 0.00  0  0  0.00  0.00 10.77 -0.93 -9.10
# The nonzero values and the NA's correspond to imputed cells.

TopGear dataset

The Top Gear data contains information on 297 cars.

library(robustHD)
data(TopGear)
dim(TopGear)
## [1] 297  32
rownames(TopGear)[1:13] # "1" to "297" are not useful names
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"
rownames(TopGear) = paste(TopGear[,1],TopGear[,2]) 
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'

# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)

# Run the DDC method:
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCtransTG = DDC(transTG,DDCpars)
##  
##  The final data set we will analyze has 296 rows and 11 columns.
## 
# With DDCpars = list(fastDDC = FALSE, silent = FALSE) we obtain more information:
# 
# The input data has 297 rows and 31 columns.
# 
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#   
#  [1] Maker         Model              Type               Fuel              
#  [5] DriveWheel    AdaptiveHeadlights AdjustableSteering AlarmSystem       
#  [9] Automatic     Bluetooth          ClimateControl     CruiseControl     
# [13] ElectricSeats Leather            ParkingSensors     PowerSteering     
# [17] SatNav        ESP                Origin            
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#   
# [1] Price Cylinders Displacement BHP   Torque Acceleration TopSpeed    
# [8] MPG   Weight    Length       Width Height      
# 
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#   
#   [1] Citroen C5 Tourer
# 
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#   
#   [1] Alfa Romeo Giulietta             Alfa Romeo MiTo                 
# .......
# [295] Volvo XC70                       Volvo XC90                      
# 
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
# 
# [1] Cylinders
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
# 
# [1] Price  Displacement BHP   Torque Acceleration TopSpeed MPG         
# [8] Weight Length       Width Height      
# 
# The final data set we will analyze has 296 rows and 11 columns.
remX = DDCtransTG$remX # the remaining part of the dataset
dim(remX)
## [1] 296  11
colSums(is.na(remX)) # There are still NAs, mainly in `Weight':
##        Price Displacement          BHP       Torque Acceleration 
##            0            8            3            3            0 
##     TopSpeed          MPG       Weight       Length        Width 
##            3           11           32           10           15 
##       Height 
##           10
# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE), 
               apply(remX,2,mad,na.rm = TRUE))
dim(standX)
## [1] 296  11
round(standX[1:5,],1) # has NAs where remX does
##                          Price Displacement  BHP Torque Acceleration
## Alfa Romeo Giulietta      -0.4         -0.4 -0.7    0.0          0.6
## Alfa Romeo MiTo           -0.9         -0.7 -0.7   -1.6          0.4
## Aston Martin Cygnet        0.3         -0.8 -0.8   -1.7          0.7
## Aston Martin DB9           2.7          2.1  1.9    1.2         -1.2
## Aston Martin DB9 Volante   2.8          2.1  1.9    1.2         -1.2
##                          TopSpeed  MPG Weight Length Width Height
## Alfa Romeo Giulietta         -0.5  1.0   -0.3   -0.3  -0.2   -0.2
## Alfa Romeo MiTo              -0.4  0.1   -1.0   -0.9  -1.1   -0.3
## Aston Martin Cygnet          -0.9  0.6   -1.3   -3.1  -1.5    0.1
## Aston Martin DB9              2.0 -1.7    0.8    0.6    NA   -1.6
## Aston Martin DB9 Volante      2.0 -1.7    1.0    0.6    NA   -1.6
transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # has NAs in outlying cells as well:
##                          Price Displacement BHP Torque Acceleration
## Alfa Romeo Giulietta      10.0          7.4 4.7    5.5         11.3
## Alfa Romeo MiTo            9.6          7.2 4.7    4.6         10.7
## Aston Martin Cygnet       10.3          7.2 4.6    4.5         11.8
## Aston Martin DB9            NA          8.7 6.2    6.1          4.6
## Aston Martin DB9 Volante    NA          8.7 6.2    6.1          4.6
##                          TopSpeed MPG Weight Length Width Height
## Alfa Romeo Giulietta          4.7  64   1385   4351  1798   1465
## Alfa Romeo MiTo               4.8  49   1090   4063  1720   1446
## Aston Martin Cygnet           4.7  56    988     NA  1680   1500
## Aston Martin DB9              5.2  19   1785   4720    NA   1282
## Aston Martin DB9 Volante      5.2  19   1890   4720    NA   1282
# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
tempX$Price = tempX$Price/1000 # to avoid printing long numbers
dim(tempX)
## [1] 296  11
columnlabels = colnames(tempX)
rowlabels = rownames(tempX)
# Show the following 17 cars in the cellmap:
showrows = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)

# Make two ggplot2 objects:
ggpcol = cellMap(D=tempX,
                 R=standX,
                 indcells=which(is.na(transTGcol)),
                 indrows=integer(0),
                 columnlabels=columnlabels,
                 rowlabels=rowlabels,
                 mTitle="By column",
                 showrows=showrows,
                 showVals="D",
                 adjustrowlabels=0.5) 
plot(ggpcol)

plot of chunk unnamed-chunk-9

ggpDDC = cellMap(D=tempX,
                 R=DDCtransTG$stdResid, 
                 indcells=DDCtransTG$indcells,
                 indrows=DDCtransTG$indrows,
                 columnlabels=columnlabels,
                 rowlabels=rowlabels,
                 mTitle="DetectDeviatingCells",
                 showrows=showrows,
                 showVals="D",
                 adjustrowlabels=0.5)
plot(ggpDDC)

plot of chunk unnamed-chunk-10

# Creating the pdf:
# pdf("cellMap_TopGear.pdf", width = 20, height = 15 )
# gridExtra::grid.arrange(ggpcol,ggpDDC,nrow=1) # combines 2 plots in a figure
# dev.off()

Analyzing new data by DDCpredict

We now consider the 17 cars shown in the cellmap as a new' dataset.

# Top Gear dataset: prediction of "new" data
############################################
# For comparison we first remake the cell map of the entire dataset, but now 
# showing the values of the residuals instead of the data values:

dim(remX) # 296 11
## [1] 296  11
rowlabels = rownames(remX)
columnlabels = colnames(remX)

ggpDDC = cellMap(D=remX,
                 R=DDCtransTG$stdResid,
                 indcells=DDCtransTG$indcells,
                 indrows=DDCtransTG$indrows,
                 standOD=NULL,
                 showVals="R",
                 rowlabels=rowlabels,
                 columnlabels=columnlabels,
                 mTitle="DDC",
                 showrows=showrows, 
                 adjustrowlabels=0.5, 
                 outlyingGrad = 1)
plot(ggpDDC)

plot of chunk unnamed-chunk-11

Define the “initial” dataset as the rows not in these 17:

initX = remX[-showrows,]
dim(initX) # 279 11
## [1] 279  11
# Fit initX:
DDCinitX = DDC(initX,DDCpars=DDCpars) 
##  
##  The final data set we will analyze has 278 rows and 11 columns.
## 

Define the “new” dataset, and apply DDCpredict to it:

newX = remX[showrows,]
dim(newX) # 17  11 
## [1] 17 11
# Make predictions by DDCpredict. 
# Its inputs are:
#   Xnew       : the new data (test data)
#   InitialDDC : Must be provided.
#   DDCpars    : the input options to be used for the prediction.
#                By default the options of InitialDDC are used. 

predictDDC = DDCpredict(newX,DDCinitX)

names(DDCinitX)
##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"  
##  [4] "namesNotNumeric" "namesCaseNumber" "namesNAcol"     
##  [7] "namesNArow"      "namesDiscrete"   "namesZeroScale" 
## [10] "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"          
## [16] "robcors"         "robslopes"       "deshrinkage"    
## [19] "Xest"            "scalestres"      "stdResid"       
## [22] "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"         
## [28] "indNAs"          "Ximp"
# For comparison with:

names(predictDDC) # Fewer, since DDCpredict does not call checkDataSet:
##  [1] "DDCpars"     "locX"        "scaleX"      "Z"           "nbngbrs"    
##  [6] "ngbrs"       "robcors"     "robslopes"   "deshrinkage" "Xest"       
## [11] "scalestres"  "stdResid"    "indcells"    "Ti"          "medTi"      
## [16] "madTi"       "indrows"     "indNAs"      "indall"      "Ximp"
# If you specify the parameters the result is the same:
predictDDC2 = DDCpredict(newX,DDCinitX,DDCpars=DDCpars)
all.equal(predictDDC,predictDDC2) # TRUE
## [1] TRUE
ggpnew = cellMap(D=newX,
                 R=predictDDC$stdResid,
                 indcells=predictDDC$indcells,
                 indrows=predictDDC$indrows,
                 standOD=NULL,
                 showVals="R",
                 rowlabels=rowlabels[showrows],
                 columnlabels=columnlabels,
                 mTitle="DDCpredict",
                 adjustrowlabels=0.5, 
                 outlyingGrad = 1)
plot(ggpnew) # Looks quite similar to the result using the entire dataset:

plot of chunk unnamed-chunk-14

# Creating the pdf:
# pdf("TopGear_DDCpredict.pdf",width=20,height=15)
# gridExtra::grid.arrange(ggpDDC,ggpnew,nrow=1) 
# dev.off()

Philips data

The philips data contains 9 measurements of TV parts from a production line.

data(philips)
dim(philips) 
## [1] 677   9
colnames(philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")

DDCphilips = DDC(philips,DDCpars)

qqnorm(as.vector(DDCphilips$Z)) # rather gaussian, here we only see 2 outliers:

plot of chunk unnamed-chunk-15

round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
##  [1,] -0.9  2.2  0.0  1.2  0.8 -2.9  2.1  0.1 -1.4
##  [2,] -0.5  2.0  0.0  3.9  0.8 -1.1  3.5 -0.4 -1.7
##  [3,] -1.0  2.3 -0.4  0.7  0.6 -2.0  1.4  0.0 -1.7
##  [4,] -0.6  2.3 -0.1  3.8  1.1 -0.9  3.4 -0.5 -2.0
##  [5,] -0.6  2.8 -0.2  3.4  0.9 -0.9  3.4  0.2 -1.9
##  [6,] -0.7  2.6  0.0  4.4  1.0 -1.1  4.2 -0.1 -2.0
##  [7,]  1.0 -0.1 -1.0 -0.3 -1.2 -1.0  1.4  0.8 -0.6
##  [8,] -0.4 -1.4 -0.7  1.8 -0.7 -2.8  1.9 -0.3 -1.7
##  [9,]  1.0 -0.2  0.0 -0.1 -1.2 -2.1  1.6  1.0 -0.9
## [10,]  0.2 -1.3 -0.1  1.0 -0.5 -2.9  2.0 -0.1 -1.5
## [11,] -0.1 -0.3 -0.1  0.2 -1.1 -1.5  1.9  0.8 -0.8
## [12,] -0.1 -0.4 -1.5  2.5  0.0 -3.4  2.3  0.3 -2.5
DDCphilips$indcells # indices of the cells that were flagged:
##   [1]   55   70   95  104  116  144  151  156  161  170  174  175  324  468
##  [15]  682  683  712  747  750  792  985  986  987  988  989  991  992 1006
##  [29] 1007 1008 1010 1011 1196 1199 1450 1452 1455 1474 1529 1787 2033 2035
##  [43] 2036 2037 2045 2047 2094 2101 2114 2116 2126 2135 2267 2312 2355 2365
##  [57] 2464 2499 2592 2743 2775 2777 3120 3216 3226 3227 3228 3248 3386 3393
##  [71] 3395 3397 3399 3401 3412 3413 3414 3415 3416 3417 3423 3424 3425 3426
##  [85] 3427 3428 3430 3431 3437 3445 3456 3457 3460 3464 3465 3466 3468 3469
##  [99] 3470 3990 4064 4066 4067 4068 4076 4078 4088 4145 4147 4792 4794 5036
## [113] 5037 5168 5169 5170 5171 5172 5174 5175 5176 5230 5238 5241 5246 5248
## [127] 5250 5253 5256 5258 5259 5260 5262 5263 5265 5267 5268 5270 5510 5511
## [141] 5512 5513 5514 5515 5516 5517 5518 5519 5520 5569 5869 5873 5882 5890
## [155] 5902
DDCphilips$indrows # flagged rows:
## [1]  26  56  84 334 491

We also apply the rowwise method MCD to detect outlying rows:

library(robustbase) # for covMcd
MCDphilips = robustbase::covMcd(philips)
indrowsMCD = which(mahalanobis(philips,MCDphilips$center,
                               MCDphilips$cov) > qchisq(0.975,df=d))

plot(sqrt(mahalanobis(philips,MCDphilips$center,MCDphilips$cov)),
     main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9))) # this horizontal line is the cutoff.

plot of chunk unnamed-chunk-17

# dev.copy(pdf,"Figure_philips_left.pdf",width=10,height=4)
# dev.off()
# cellMaps with rectangular blocks:

n = nrow(philips)
nrowsinblock = 15
rowlabels = 1:n

d = ncol(philips)
ncolumnsinblock = 1
columnlabels = colnames(philips)

ggpMCDphilips = cellMap(D=philips,
                        R=matrix(0,n,d),
                        indcells=integer(0),
                        indrows=indrowsMCD,
                        rowlabels=rowlabels,
                        columnlabels=columnlabels,                        
                        mTitle="MCD",
                        nrowsinblock=nrowsinblock,
                        ncolumnsinblock=ncolumnsinblock,                        
                        autolabel=T)
plot(ggpMCDphilips)

plot of chunk unnamed-chunk-18

ggpDDCphilips = cellMap(D=philips, 
                        R=DDCphilips$stdResid,
                        indcells=DDCphilips$indcells,
                        indrows=DDCphilips$indrows,
                        rowlabels=rowlabels,
                        columnlabels=columnlabels,                        
                        mTitle="DetectDeviatingCells",
                        nrowsinblock=nrowsinblock,
                        ncolumnsinblock=ncolumnsinblock,
                        autolabel=T,
                        adjustrowlabels=1)
plot(ggpDDCphilips)

plot of chunk unnamed-chunk-18

# dev.copy(pdf,"Figure_philips_right.pdf",width=6,height=12)
# dev.off()

Mortality dataset

This dataset contains the mortality of French males in the years 1816 to 2013.

data(mortality)
dim(mortality)
## [1] 198  91
# 198  91
rownames(mortality)[1:5] 
## [1] "1816" "1817" "1818" "1819" "1820"
colnames(mortality)[1:5] 
## [1] "0" "1" "2" "3" "4"
DDCmortality = DDC(mortality,DDCpars) # 1 second

remX = DDCmortality$remX
dim(remX)
## [1] 198  91

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA
PCAmortality = rrcov::PcaHubert(mortality,alpha=0.75,scale=FALSE)

n = nrow(remX)
nrowsinblock = 5
rowlabels = rownames(remX)

d = ncol(remX)
ncolumnsinblock = 5
columnlabels = colnames(remX)

ggpROBPCA = cellMap(D=remX,
                    R=matrix(0,n,d),
                    indrows=which(PCAmortality@flag==FALSE),
                    rowlabels=rowlabels,
                    columnlabels=columnlabels,                    
                    mTitle="By row",
                    nrowsinblock=nrowsinblock,
                    ncolumnsinblock=ncolumnsinblock, 
                    rowtitle = "Years",
                    columntitle = "Age",
                    sizetitles = 2.0,
                    autolabel=T)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-21

ggpDDC = cellMap(D=remX, 
                 R=DDCmortality$stdResid,
                 indcells=DDCmortality$indcells,
                 indrows=DDCmortality$indrows,
                 rowlabels=rowlabels,
                 columnlabels=columnlabels,                 
                 mTitle="DetectDeviatingCells",
                 nrowsinblock=nrowsinblock,
                 ncolumnsinblock=ncolumnsinblock,
                 rowtitle = "Years",
                 columntitle = "Age",
                 sizetitles = 2.0,
                 autolabel=T)
plot(ggpDDC) # Leads to a detailed interpretation:

plot of chunk unnamed-chunk-21

# pdf("cellmap_mortality.pdf",width=14,height=12)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
# dev.off()

Glass dataset

The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.

data(glass)
DDCglass = DDC(glass,DDCpars) # takes 8 seconds
##  
##  The final data set we will analyze has 180 rows and 737 columns.
## 
remX = DDCglass$remX
# With DDCpars$silent = FALSE we obtain more information:
#
#  The input data has 180 rows and 750 columns.
#
#  The data contained 11 discrete columns with 3 or fewer values.
#  Their column names are:
#
#  [1] V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 739 columns:
#  
#   [1] V12  V13  V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25
#    ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The data contained 2 columns with zero or tiny median absolute deviation.
#  Their column names are:
#
# [1] V12 V13
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 737 columns:
#
#   [1] V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27
#    ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The final data set we will analyze has 180 rows and 737 columns.

dim(remX)
## [1] 180 737

We will compare this with the faster approximate algorithm of DDC, obtained by the option fastDDC=TRUE:

fastDDCpars = list(fastDDC = TRUE, silent = TRUE)
fastDDCglass = DDC(glass, fastDDCpars) # takes 2 seconds
##  
##  The final data set we will analyze has 180 rows and 737 columns.
## 
remXfast = fastDDCglass$remX
all.equal(remX,remXfast) # The remaining data is the same:
## [1] TRUE

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA
PCAglass = rrcov::PcaHubert(remX,alpha=0.75,scale=FALSE)

n = nrow(remX)
nrowsinblock = 5
rowtitle = "glass samples"
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/ncolumnsinblock)] = "n";
# rowlabels

d = ncol(remX)
ncolumnsinblock = 5
columntitle = "wavelengths"
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/nrowsinblock)] = "d"
# columnlabels

ggpROBPCA = cellMap(D=remX, 
                    R=matrix(0,n,d),
                    indcells=integer(0),
                    indrows=which(PCAglass@flag==FALSE),
                    rowlabels=rowlabels,
                    columnlabels=columnlabels,
                    mTitle="By row",
                    nrowsinblock=nrowsinblock,
                    ncolumnsinblock=ncolumnsinblock,
                    columnangle=0,
                    rowtitle=rowtitle,
                    columntitle=columntitle,
                    sizetitles=1.5,
                    autolabel=F)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-25

ggpDDC = cellMap(D=remX, 
                 R=DDCglass$stdResid,
                 indcells=DDCglass$indcells,
                 indrows=DDCglass$indrows,
                 rowlabels=rowlabels,
                 columnlabels=columnlabels,
                 mTitle="DDC",
                 nrowsinblock=nrowsinblock,
                 ncolumnsinblock=ncolumnsinblock,
                 columnangle=0,
                 rowtitle=rowtitle,
                 columntitle=columntitle,
                 sizetitles=1.5,
                 autolabel=F)
plot(ggpDDC)

plot of chunk unnamed-chunk-25

# pdf("cellmap_glass_ROBPCA_DDC.pdf",width=16,height=10)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
# dev.off()

ggpfastDDC = cellMap(D=remXfast, 
                     R=fastDDCglass$stdResid,
                     indcells=fastDDCglass$indcells,
                     indrows=fastDDCglass$indrows,
                     rowlabels=rowlabels,
                     columnlabels=columnlabels,
                     mTitle="fast DDC",
                     nrowsinblock=nrowsinblock,
                     ncolumnsinblock=ncolumnsinblock,
                     columnangle=0,
                     rowtitle=rowtitle,
                     columntitle=columntitle,    
                     sizetitles=1.5,
                     autolabel=F)
plot(ggpfastDDC)

plot of chunk unnamed-chunk-25

# pdf("cellmap_glass_DDC_fastDDC.pdf",width=16,height=10)
# gridExtra::grid.arrange(ggpDDC,ggpfastDDC,ncol=1)
# dev.off()