Melanie Tietje
February 2019
Charles Edward Spearman (1863 – 1945) was an English psychologist.
“The p-value is the probability of observing data at least as favorable to the alternative hypothesis as our current data set, if the null hypothesis is true” (Open Intro Statistics)
library(magrittr)

# Simulated predictor: 100 uniform draws on [0, 1].
x <- runif(100, min = 0, max = 1)

start <- 10  # minimum percentage of y values to randomize
end <- 95    # maximum percentage of y values to randomize

# Build one trace per randomization level: y starts as a perfect copy of x,
# then `step` percent of its entries are replaced by fresh uniform noise, and
# the Spearman correlation of the result with x is recorded.
aval <- list()
for (step in start:end) {
  # Indices of the y values to overwrite: step% of the vector, no repeats.
  scramble <- sample(seq_along(x), step * 0.01 * length(x), replace = FALSE)
  temp <- data.frame(x = x, y = x)
  # Replace the selected y values with uniform noise (explicit base-R form of
  # the original magrittr `temp$y[scramble] %<>% runif` idiom).
  temp$y[scramble] <- runif(length(scramble))
  # Run the test once and reuse the result: the original called cor.test()
  # twice per iteration and fetched the p-value via fragile `$p.` partial
  # matching instead of the documented `$p.value` component.
  ct <- cor.test(x, temp$y, method = "s")
  aval[[step]] <- list(visible = FALSE,
                       name = paste0('v = ', step),
                       x = x,
                       y = temp$y,
                       rho = ct$estimate[[1]],
                       p = ct$p.value[[1]]
  )
}
# Load the precomputed simulation results: a data frame `dat` with columns
# x, y, and `random` (presumably the randomization level 10-95 used for that
# row's y value -- confirm against the generating script).
# NOTE(review): assumes spearman.RData sits in the working directory.
load("spearman.RData")
head(dat, 20)
x y random
1 0.63860736 0.63860736 10
2 0.64373766 0.64373766 10
3 0.09359379 0.09359379 10
4 0.42091535 0.53961715 10
5 0.03532037 0.03532037 10
6 0.31653066 0.31653066 10
7 0.89893546 0.89893546 10
8 0.23923984 0.23923984 10
9 0.69392201 0.69392201 10
10 0.89247378 0.89247378 10
11 0.68513072 0.68513072 10
12 0.10910229 0.10910229 10
13 0.82045396 0.82045396 10
14 0.70001250 0.70001250 10
15 0.71046496 0.71046496 10
16 0.45163949 0.45163949 10
17 0.14559774 0.14559774 10
18 0.11367611 0.11859527 10
19 0.89437855 0.89437855 10
20 0.19451756 0.19451756 10
library(dplyr)

# For each target sample size, subsample the data within every randomization
# group and recompute the Spearman correlation, collecting rho, p-value,
# randomization level, and sample size. Vectors are preallocated (the
# original grew them with c() inside the loop, which copies on every append).
# NOTE(review): relies on `samples_per_group`, `dat`, `start`, and `end`
# being defined earlier in the full script.
n_results <- length(samples_per_group) * (end - start + 1)
samplesize <- numeric(n_results)
rho <- numeric(n_results)
p.value <- numeric(n_results)
randomization <- integer(n_results)
k <- 0
for (j in seq_along(samples_per_group)) { # run for each samplesize
  # Draw at most samples_per_group[j] rows from each randomization group.
  sub <- dat %>%
    group_by(random) %>%
    slice(sample(n(), min(samples_per_group[j], n()))) %>%
    ungroup()
  for (i in start:end) { # calculate correlation for each randomization
    # One cor.test() per group: the original called it twice and read the
    # p-value via fragile `$p.` partial matching instead of `$p.value`.
    ct <- cor.test(sub$x[sub$random == i], sub$y[sub$random == i],
                   method = "s")
    k <- k + 1
    rho[k] <- ct$estimate[[1]]
    p.value[k] <- ct$p.value[[1]]
    randomization[k] <- i
    samplesize[k] <- samples_per_group[j]
  }
}
all <- data.frame(rho = rho, p.value = p.value,
                  randomization = randomization, samplesize = samplesize)
head(all, 20)
rho p.value randomization samplesize
1 0.9130879 0 10 95
2 0.9218785 0 11 95
3 0.8516797 0 12 95
4 0.8853443 0 13 95
5 0.9409854 0 14 95
6 0.9175392 0 15 95
7 0.8464306 0 16 95
8 0.8506019 0 17 95
9 0.8666433 0 18 95
10 0.8618841 0 19 95
11 0.7955067 0 20 95
12 0.6642497 0 21 95
13 0.8080627 0 22 95
14 0.7210386 0 23 95
15 0.8103863 0 24 95
16 0.7507279 0 25 95
17 0.7162514 0 26 95
18 0.8047312 0 27 95
19 0.7423992 0 28 95
20 0.7844765 0 29 95
# Model the (log of the) smallest rho that still reached significance as a
# linear function of sample size.
# NOTE(review): `res` is built outside this excerpt -- presumably one row per
# sample size with the minimum significant rho; confirm in the full script.
summary(lm(data=res, log(smallest_significant_rho)~samplesize))
Call:
lm(formula = log(smallest_significant_rho) ~ samplesize, data = res)
Residuals:
Min 1Q Median 3Q Max
-0.11339 -0.08851 -0.01100 0.05852 0.23767
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.5553022 0.0559552 -9.924 3.05e-08 ***
samplesize -0.0115485 0.0009555 -12.086 1.86e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1052 on 16 degrees of freedom
Multiple R-squared: 0.9013, Adjusted R-squared: 0.8951
F-statistic: 146.1 on 1 and 16 DF, p-value: 1.855e-09
-> Looks like every correlation, however weak, becomes statistically significant if the sample size is large enough
-> Detecting weak (yet statistically significant) correlations always requires large sample sizes