There are three kind of variables
dbl
in data.frame
but there are only two distinct value (like 1,0).
library(tidyverse)
#> ─ Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ─
#> ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
#> ✔ tibble 1.4.2 ✔ dplyr 0.7.6
#> ✔ tidyr 0.8.1 ✔ stringr 1.3.1
#> ✔ readr 1.1.1 ✔ forcats 0.3.0
#> ─ Conflicts ──────────────────────────────────────── tidyverse_conflicts() ─
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
set.seed(123)
sample_data <-
cbind(
matrix(
c(
1:100
)
,nrow = 10
,byrow = F
)
,matrix(
c(
letters[1:20]
)
,nrow = 10
,byrow = T
)
,matrix(
c(
rep(0:1,5)
)
,nrow = 10
,byrow = T
)
) %>%
`colnames<-`(paste0('v',1:13)) %>%
`rownames<-`(1:10) %>%
as.data.frame() %>%
mutate_at(vars(1:10,13),as.integer) %>%
mutate_at(vars(1:10),~cut_number(.,3))
sample_data
#> v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
#> 1 [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4]
#> 2 [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4]
#> 3 [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4]
#> 4 (4,7] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] [1,4] (4,7]
#> 5 (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7]
#> 6 (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7]
#> 7 (7,10] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (4,7] (7,10]
#> 8 (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10]
#> 9 (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10]
#> 10 [1,4] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] (7,10] [1,4]
#> v11 v12 v13
#> 1 a b 1
#> 2 c d 2
#> 3 e f 1
#> 4 g h 2
#> 5 i j 1
#> 6 k l 2
#> 7 m n 1
#> 8 o p 2
#> 9 q r 1
#> 10 s t 2
library(caret)
#> 载入需要的程辑包:lattice
#>
#> 载入程辑包:'caret'
#> The following object is masked from 'package:purrr':
#>
#> lift
sample_data_after_onehot <-
sample_data %>%
predict(dummyVars(~.,data=.),newdata =.) %>%
as.data.frame()
sample_data_after_onehot
#> v1.[1,4] v1.(4,7] v1.(7,10] v2.[1,4] v2.(4,7] v2.(7,10] v3.[1,4]
#> 1 1 0 0 1 0 0 1
#> 2 1 0 0 1 0 0 1
#> 3 1 0 0 1 0 0 1
#> 4 0 1 0 1 0 0 1
#> 5 0 1 0 0 1 0 0
#> 6 0 1 0 0 1 0 0
#> 7 0 0 1 0 1 0 0
#> 8 0 0 1 0 0 1 0
#> 9 0 0 1 0 0 1 0
#> 10 1 0 0 0 0 1 0
#> v3.(4,7] v3.(7,10] v4.[1,4] v4.(4,7] v4.(7,10] v5.[1,4] v5.(4,7]
#> 1 0 0 1 0 0 1 0
#> 2 0 0 1 0 0 1 0
#> 3 0 0 1 0 0 1 0
#> 4 0 0 1 0 0 1 0
#> 5 1 0 0 1 0 0 1
#> 6 1 0 0 1 0 0 1
#> 7 1 0 0 1 0 0 1
#> 8 0 1 0 0 1 0 0
#> 9 0 1 0 0 1 0 0
#> 10 0 1 0 0 1 0 0
#> v5.(7,10] v6.[1,4] v6.(4,7] v6.(7,10] v7.[1,4] v7.(4,7] v7.(7,10]
#> 1 0 1 0 0 1 0 0
#> 2 0 1 0 0 1 0 0
#> 3 0 1 0 0 1 0 0
#> 4 0 1 0 0 1 0 0
#> 5 0 0 1 0 0 1 0
#> 6 0 0 1 0 0 1 0
#> 7 0 0 1 0 0 1 0
#> 8 1 0 0 1 0 0 1
#> 9 1 0 0 1 0 0 1
#> 10 1 0 0 1 0 0 1
#> v8.[1,4] v8.(4,7] v8.(7,10] v9.[1,4] v9.(4,7] v9.(7,10] v10.[1,4]
#> 1 1 0 0 1 0 0 1
#> 2 1 0 0 1 0 0 1
#> 3 1 0 0 1 0 0 1
#> 4 1 0 0 1 0 0 0
#> 5 0 1 0 0 1 0 0
#> 6 0 1 0 0 1 0 0
#> 7 0 1 0 0 1 0 0
#> 8 0 0 1 0 0 1 0
#> 9 0 0 1 0 0 1 0
#> 10 0 0 1 0 0 1 1
#> v10.(4,7] v10.(7,10] v11.a v11.c v11.e v11.g v11.i v11.k v11.m v11.o
#> 1 0 0 1 0 0 0 0 0 0 0
#> 2 0 0 0 1 0 0 0 0 0 0
#> 3 0 0 0 0 1 0 0 0 0 0
#> 4 1 0 0 0 0 1 0 0 0 0
#> 5 1 0 0 0 0 0 1 0 0 0
#> 6 1 0 0 0 0 0 0 1 0 0
#> 7 0 1 0 0 0 0 0 0 1 0
#> 8 0 1 0 0 0 0 0 0 0 1
#> 9 0 1 0 0 0 0 0 0 0 0
#> 10 0 0 0 0 0 0 0 0 0 0
#> v11.q v11.s v12.b v12.d v12.f v12.h v12.j v12.l v12.n v12.p v12.r v12.t
#> 1 0 0 1 0 0 0 0 0 0 0 0 0
#> 2 0 0 0 1 0 0 0 0 0 0 0 0
#> 3 0 0 0 0 1 0 0 0 0 0 0 0
#> 4 0 0 0 0 0 1 0 0 0 0 0 0
#> 5 0 0 0 0 0 0 1 0 0 0 0 0
#> 6 0 0 0 0 0 0 0 1 0 0 0 0
#> 7 0 0 0 0 0 0 0 0 1 0 0 0
#> 8 0 0 0 0 0 0 0 0 0 1 0 0
#> 9 1 0 0 0 0 0 0 0 0 0 1 0
#> 10 0 1 0 0 0 0 0 0 0 0 0 1
#> v13
#> 1 1
#> 2 2
#> 3 1
#> 4 2
#> 5 1
#> 6 2
#> 7 1
#> 8 2
#> 9 1
#> 10 2
library(irlba)
#> 载入需要的程辑包:Matrix
#>
#> 载入程辑包:'Matrix'
#> The following object is masked from 'package:tidyr':
#>
#> expand
pca_score <- prcomp_irlba(sample_data_after_onehot,n=2,center = T,scale. = F)
sample_data
is discretized for all continuous variable by cut_number
.prcomp_irlba
instead of prcomp
partly because prcomp
is hard to train big data.dummyVars
function I use is to one-hot encoding the category variables.r_output <-
pca_score$rotation %>%
`rownames<-`(sample_data_after_onehot %>% names) %>%
as.data.frame() %>%
rownames_to_column('variable')
r_output
#> variable PC1 PC2
#> 1 v1.[1,4] 0.181584194 -0.0767965148
#> 2 v1.(4,7] -0.036772084 0.1760748168
#> 3 v1.(7,10] -0.144812109 -0.0992783020
#> 4 v2.[1,4] 0.262610278 0.0202091466
#> 5 v2.(4,7] -0.145356978 0.2172683960
#> 6 v2.(7,10] -0.117253300 -0.2374775426
#> 7 v3.[1,4] 0.262610278 0.0202091466
#> 8 v3.(4,7] -0.145356978 0.2172683960
#> 9 v3.(7,10] -0.117253300 -0.2374775426
#> 10 v4.[1,4] 0.262610278 0.0202091466
#> 11 v4.(4,7] -0.145356978 0.2172683960
#> 12 v4.(7,10] -0.117253300 -0.2374775426
#> 13 v5.[1,4] 0.262610278 0.0202091466
#> 14 v5.(4,7] -0.145356978 0.2172683960
#> 15 v5.(7,10] -0.117253300 -0.2374775426
#> 16 v6.[1,4] 0.262610278 0.0202091466
#> 17 v6.(4,7] -0.145356978 0.2172683960
#> 18 v6.(7,10] -0.117253300 -0.2374775426
#> 19 v7.[1,4] 0.262610278 0.0202091466
#> 20 v7.(4,7] -0.145356978 0.2172683960
#> 21 v7.(7,10] -0.117253300 -0.2374775426
#> 22 v8.[1,4] 0.262610278 0.0202091466
#> 23 v8.(4,7] -0.145356978 0.2172683960
#> 24 v8.(7,10] -0.117253300 -0.2374775426
#> 25 v9.[1,4] 0.262610278 0.0202091466
#> 26 v9.(4,7] -0.145356978 0.2172683960
#> 27 v9.(7,10] -0.117253300 -0.2374775426
#> 28 v10.[1,4] 0.181584194 -0.0767965148
#> 29 v10.(4,7] -0.036772084 0.1760748168
#> 30 v10.(7,10] -0.144812109 -0.0992783020
#> 31 v11.a 0.068936220 0.0014917292
#> 32 v11.c 0.069168005 -0.0009421594
#> 33 v11.e 0.068936220 0.0014917292
#> 34 v11.g 0.055569833 0.0181678477
#> 35 v11.i -0.046286851 0.0801704289
#> 36 v11.k -0.046055067 0.0777365403
#> 37 v11.m -0.053015061 0.0593614268
#> 38 v11.o -0.045782632 -0.0805368087
#> 39 v11.q -0.046014417 -0.0781029201
#> 40 v11.s -0.025456251 -0.0788378138
#> 41 v12.b 0.068936220 0.0014917292
#> 42 v12.d 0.069168005 -0.0009421594
#> 43 v12.f 0.068936220 0.0014917292
#> 44 v12.h 0.055569833 0.0181678477
#> 45 v12.j -0.046286851 0.0801704289
#> 46 v12.l -0.046055067 0.0777365403
#> 47 v12.n -0.053015061 0.0593614268
#> 48 v12.p -0.045782632 -0.0805368087
#> 49 v12.r -0.046014417 -0.0781029201
#> 50 v12.t -0.025456251 -0.0788378138
#> 51 v13 0.007443888 -0.0644123940
r_output
is what I get when I finish the pca model.
Now, we recode it in SQL style.
library(add2deployment)
trans_inSQL(input=r_output)
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [51].
#>
#> 载入程辑包:'glue'
#> The following object is masked from 'package:dplyr':
#>
#> collapse
#> Warning in bind_rows_(x, .id): Vectorizing 'glue' elements may not preserve
#> their attributes
#> Warning in bind_rows_(x, .id): Vectorizing 'glue' elements may not preserve
#> their attributes
#> select user_id
#>
#>
#> ,
#>
#>
#> ifnull(cast((v1>[1 and v1<=4) as double),0)*0.181584193848469+
#> ifnull(cast((v1>4 and v1<=7) as double),0)*-0.0367720844560349+
#> ifnull(cast((v1>7 and v1<=10) as double),0)*-0.144812109392434+
#> ifnull(cast((v2>[1 and v2<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v2>4 and v2<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v2>7 and v2<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v3>[1 and v3<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v3>4 and v3<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v3>7 and v3<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v4>[1 and v4<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v4>4 and v4<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v4>7 and v4<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v5>[1 and v5<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v5>4 and v5<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v5>7 and v5<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v6>[1 and v6<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v6>4 and v6<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v6>7 and v6<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v7>[1 and v7<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v7>4 and v7<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v7>7 and v7<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v8>[1 and v8<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v8>4 and v8<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v8>7 and v8<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v9>[1 and v9<=4) as double),0)*0.262610278076872+
#> ifnull(cast((v9>4 and v9<=7) as double),0)*-0.145356978354161+
#> ifnull(cast((v9>7 and v9<=10) as double),0)*-0.117253299722711+
#> ifnull(cast((v10>[1 and v10<=4) as double),0)*0.181584193848469+
#> ifnull(cast((v10>4 and v10<=7) as double),0)*-0.0367720844560348+
#> ifnull(cast((v10>7 and v10<=10) as double),0)*-0.144812109392434+
#> ifnull(cast(v11='a' as double),0)*0.0689362200661296+
#> ifnull(cast(v11='c' as double),0)*0.0691680047254186+
#> ifnull(cast(v11='e' as double),0)*0.0689362200661296+
#> ifnull(cast(v11='g' as double),0)*0.0555698332191943+
#> ifnull(cast(v11='i' as double),0)*-0.0462868511672591+
#> ifnull(cast(v11='k' as double),0)*-0.0460550665079701+
#> ifnull(cast(v11='m' as double),0)*-0.0530150606789322+
#> ifnull(cast(v11='o' as double),0)*-0.0457826320271065+
#> ifnull(cast(v11='q' as double),0)*-0.0460144166863955+
#> ifnull(cast(v11='s' as double),0)*-0.025456251009209+
#> ifnull(cast(v12='b' as double),0)*0.0689362200661297+
#> ifnull(cast(v12='d' as double),0)*0.0691680047254186+
#> ifnull(cast(v12='f' as double),0)*0.0689362200661297+
#> ifnull(cast(v12='h' as double),0)*0.0555698332191943+
#> ifnull(cast(v12='j' as double),0)*-0.0462868511672591+
#> ifnull(cast(v12='l' as double),0)*-0.0460550665079701+
#> ifnull(cast(v12='n' as double),0)*-0.0530150606789322+
#> ifnull(cast(v12='p' as double),0)*-0.0457826320271065+
#> ifnull(cast(v12='r' as double),0)*-0.0460144166863955+
#> ifnull(cast(v12='t' as double),0)*-0.025456251009209 as pc1,
#>
#>
#> ifnull(cast((v1>[1 and v1<=4) as double),0)*-0.0767965148209343+
#> ifnull(cast((v1>4 and v1<=7) as double),0)*0.176074816813784+
#> ifnull(cast((v1>7 and v1<=10) as double),0)*-0.0992783019928502+
#> ifnull(cast((v2>[1 and v2<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v2>4 and v2<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v2>7 and v2<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v3>[1 and v3<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v3>4 and v3<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v3>7 and v3<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v4>[1 and v4<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v4>4 and v4<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v4>7 and v4<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v5>[1 and v5<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v5>4 and v5<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v5>7 and v5<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v6>[1 and v6<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v6>4 and v6<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v6>7 and v6<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v7>[1 and v7<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v7>4 and v7<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v7>7 and v7<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v8>[1 and v8<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v8>4 and v8<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v8>7 and v8<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v9>[1 and v9<=4) as double),0)*0.0202091466372069+
#> ifnull(cast((v9>4 and v9<=7) as double),0)*0.217268395959826+
#> ifnull(cast((v9>7 and v9<=10) as double),0)*-0.237477542597033+
#> ifnull(cast((v10>[1 and v10<=4) as double),0)*-0.0767965148209343+
#> ifnull(cast((v10>4 and v10<=7) as double),0)*0.176074816813785+
#> ifnull(cast((v10>7 and v10<=10) as double),0)*-0.0992783019928501+
#> ifnull(cast(v11='a' as double),0)*0.00149172918719605+
#> ifnull(cast(v11='c' as double),0)*-0.000942159419657452+
#> ifnull(cast(v11='e' as double),0)*0.00149172918719604+
#> ifnull(cast(v11='g' as double),0)*0.0181678476824723+
#> ifnull(cast(v11='i' as double),0)*0.0801704288690829+
#> ifnull(cast(v11='k' as double),0)*0.0777365402622294+
#> ifnull(cast(v11='m' as double),0)*0.0593614268285136+
#> ifnull(cast(v11='o' as double),0)*-0.0805368087141086+
#> ifnull(cast(v11='q' as double),0)*-0.0781029201072551+
#> ifnull(cast(v11='s' as double),0)*-0.078837813775669+
#> ifnull(cast(v12='b' as double),0)*0.00149172918719604+
#> ifnull(cast(v12='d' as double),0)*-0.000942159419657456+
#> ifnull(cast(v12='f' as double),0)*0.00149172918719604+
#> ifnull(cast(v12='h' as double),0)*0.0181678476824723+
#> ifnull(cast(v12='j' as double),0)*0.0801704288690829+
#> ifnull(cast(v12='l' as double),0)*0.0777365402622293+
#> ifnull(cast(v12='n' as double),0)*0.0593614268285135+
#> ifnull(cast(v12='p' as double),0)*-0.0805368087141086+
#> ifnull(cast(v12='r' as double),0)*-0.0781029201072551+
#> ifnull(cast(v12='t' as double),0)*-0.078837813775669 as pc2
#>
#>
#> from yourdataset