324 lines
9.0 KiB
Plaintext
324 lines
9.0 KiB
Plaintext
---
|
|
title: "Comparaison of methods"
|
|
output: pdf_document
|
|
---
|
|
|
|
# Scan statistique - Méthode de Monte Carlo et calcul de p-value
|
|
|
|
## Import libraries
|
|
```{r}
|
|
library("localScore")
|
|
library("latex2exp")
|
|
library("Rcpp")
|
|
```
|
|
|
|
## 1. Proposition for simulations under $\mathcal{H}_1$
|
|
|
|
In this part, we propose a method that simulates a Poisson process under the hypothesis $\mathcal{H}_1$. The idea is to simulate a sample under $\mathcal{H}_0$, and add randomly a subsequence under the alternative hypothesis in this sequence.
|
|
```{r}
|
|
PoissonProcess <- function(lambda,T) {
|
|
return(sort(runif(rpois(1,lambda*T),0,T)))
|
|
}
|
|
|
|
SimulationH1 <- function(lambda0, lambda1,T,tau){
|
|
ppH0=PoissonProcess(lambda0,T)
|
|
ppH1.segt=PoissonProcess(lambda1,tau)
|
|
dbt=runif(1,0,T-tau)
|
|
ppH0bis=PoissonProcess(lambda0,T)
|
|
ppH1.repo=dbt+ppH1.segt
|
|
ppH0_avant=ppH0bis[which(ppH0bis<ppH1.repo[1])]
|
|
ppH0_apres=ppH0bis[which(ppH0bis>ppH1.repo[length(ppH1.repo)])]
|
|
ppH1=c(ppH0_avant,ppH1.repo,ppH0_apres)
|
|
return (c(ppH1,which(ppH1==min(ppH1.repo))))
|
|
}
|
|
```
|
|
|
|
|
|
```{r}
|
|
TimeBetweenEvent <- function(pp){
|
|
n=length(pp)
|
|
tbe=pp[2:n]-pp[1:n1-1]
|
|
tbe=c(0,tbe)
|
|
return (tbe)
|
|
}
|
|
|
|
DataFrame <- function(pp,tbe){
|
|
list=data.frame(ProcessusPoisson=pp, TimeBetweenEvent=tbe)
|
|
}
|
|
```
|
|
|
|
## 2. Simulation of the sequences under $\mathcal{H}_0$ via a Monte Carlo Method
|
|
In this part, we will try to simulate, using a Monte Carlo method, a set of $10^5$ independant samples, under the assumption that $\lambda=\lambda_0$, hence, that we are under the null hypothesis $\mathcal{H}_0$.
|
|
```{r}
|
|
ScanStat <- function(pp, T, tau){
|
|
n=length(pp)
|
|
stop=n-length(which(pp>(T-tau)))
|
|
ScanStat=0
|
|
for (i in (1:stop)) {
|
|
x=which((pp>=pp[i])&(pp<=(pp[i]+tau)))
|
|
scan=length(x)
|
|
if (scan>ScanStat) {ScanStat=scan}
|
|
}
|
|
return (c(i,ScanStat))
|
|
}
|
|
```
|
|
|
|
We test the scan statistic method for different values of $\lambda_0$. The method of scan statistic we implemented will allow us to have access to the scan test statistic and where it happens in the sequence.
|
|
```{r}
|
|
EmpDistrib <- function(lambda, n_sample,T,tau){
|
|
pp=PoissonProcess(lambda,T)
|
|
scan=c(ScanStat(pp,T, tau)[2])
|
|
index=c(ScanStat(pp,T, tau)[1])
|
|
for (i in 2:(n_sample)){
|
|
pp=PoissonProcess(lambda,T)
|
|
scan=rbind(scan,ScanStat(pp,T, tau)[2])
|
|
index=rbind(index,ScanStat(pp,T, tau)[1])
|
|
}
|
|
min_scan=min(scan)-1
|
|
max_scan=max(scan)
|
|
table1=table(factor(scan, levels = min_scan:max_scan))
|
|
EmpDis=data.frame(cdf=cumsum(table1)/sum(table1), proba=table1/sum(table1), index_scan=min_scan:max_scan)
|
|
EmpDis<-EmpDis[,-2]
|
|
return(EmpDis)
|
|
}
|
|
```
|
|
```{r}
|
|
Plot_CDF <- function(lambda,n_sample,T,tau){
|
|
Emp=EmpDistrib(lambda,n_sample,T,tau)
|
|
title=TeX(paste(r'(Cumulative distribution function for $\lambda=$)', lambda))
|
|
plot(Emp$index_scan, Emp$cdf,type="s",xlab="Number of occurrences",ylab="Probability", main=title, col="red")
|
|
return(Emp)
|
|
}
|
|
```
|
|
### 2.1 Test of $\mathcal{H}_0: \lambda=\lambda_0$ against $\mathcal{H}_0: \lambda=\lambda_1$, where $\lambda_1 > \lambda_0$
|
|
In this part, we will test different values for $\lambda_0$ and $\lambda_1$, and compute the probability of occurrence of a certain scan statistic.
|
|
|
|
```{r}
|
|
#Empiricial distribution under H0
|
|
n_sample=10**4
|
|
lambda0=3
|
|
T=10
|
|
tau=1
|
|
ppH0=PoissonProcess(lambda0,T)
|
|
CDF=Plot_CDF(lambda0,n_sample,T,tau)
|
|
```
|
|
```{r}
|
|
n_sample=10**4
|
|
lambda1=4
|
|
T=10
|
|
tau=1
|
|
ppH0=PoissonProcess(lambda1,T)
|
|
CDF=Plot_CDF(lambda1,n_sample,T,tau)
|
|
```
|
|
|
|
```{r}
|
|
PValue <- function(Emp,ppH1, T, tau){
|
|
scanH1=ScanStat(ppH1,T,tau)[2]
|
|
index_scanH1=ScanStat(ppH1,T,tau)[1]
|
|
index=Emp$index_scan
|
|
n=length(index)
|
|
if (scanH1< min(Emp$index_scan)){
|
|
return (c(scanH1,1,index_scanH1))
|
|
} else{
|
|
if(min(Emp$index_scan)<scanH1 && scanH1<=max(Emp$index_scan)){
|
|
return(c(scanH1,1-Emp$cdf[scanH1-min(Emp$index_scan)+1],index_scanH1))
|
|
} else{return (c(scanH1,0,index_scanH1))}}
|
|
}
|
|
```
|
|
|
|
### 2.2. Simulation under $\mathcal{H}_0$ and computation of p-values
|
|
On simule des séquences sous $\mathcal{H}_0$, que l'on stocke. On calcule la valeur de la scan stat et de la p-value, que l'on stocke aussi. On a une séquence de p-valeur des scans et une séquence de score local.
|
|
```{r}
|
|
NbSeqH0=1000
|
|
NbSeqH1=NbSeqH0
|
|
DataH0=vector("list")
|
|
DataH1=vector("list")
|
|
lambda0=3
|
|
lambda1=30
|
|
T=10
|
|
tau=1
|
|
|
|
#Creation of a sequence that contains the sequence simulated under the null hypothesis
|
|
for (i in 1:NbSeqH0) {
|
|
ppi=PoissonProcess(lambda0,T)
|
|
DataH0[[i]]=ppi
|
|
}
|
|
|
|
#Creation of a sequence that contains the sequence simulated under the alternative hypothesis
|
|
seqH1begin=c()
|
|
for (i in 1:NbSeqH1) {
|
|
pphi=SimulationH1(lambda0, lambda1,T,tau)
|
|
DataH1[[i]]=pphi[1]
|
|
seqH1begin=c(pphi[2],seqH1begin)
|
|
}
|
|
|
|
#Computation of the time between events
|
|
TimeBetweenEventList <- function(list,n_list){
|
|
TBE=vector("list",length=n_list)
|
|
for (i in (1:n_list)) {
|
|
ppi=list[[i]]
|
|
ni=length(ppi)
|
|
tbei=ppi[2:ni]-ppi[1:ni-1]
|
|
TBE[[i]]=tbei
|
|
}
|
|
return (TBE)
|
|
}
|
|
tbe0=TimeBetweenEventList(DataH0,NbSeqH0)
|
|
```
|
|
We compute the p-value associated to all 5 sequences, and stock them in a vector.
|
|
|
|
```{r}
|
|
#We start by computing the empirical distribution for lambda0
|
|
Emp=EmpDistrib(lambda0,n_sample,T,tau)
|
|
|
|
pvalue=c()
|
|
index_scan=c()
|
|
|
|
#Then, we stock the p-value and the
|
|
for (i in 1:NbSeqH0){
|
|
ppi=DataH0[[i]]
|
|
result=PValue(Emp,DataH0[[i]],T,tau)
|
|
scan=c(scan,result[1])
|
|
pvalue=c(pvalue,result[2])
|
|
index_scan=c(index_scan,result[3])
|
|
#cat(paste("\nSimulation for the sequence", i, ", for lambda0=",lambda0, " ,lambda1=", lambda1, " , scan=", result[1] ,"p-value=",result[2]))
|
|
#print(length(ppi))
|
|
}
|
|
ScS_H0=data.frame(num=1:NbSeqH0, scan_stat=scan, pvalue_scan=pvalue, class=(pvalue<0.05), begin_scan=index_scan)
|
|
sum(ScS_H0$class[which(ScS_H0$class==TRUE)])/NbSeqH0
|
|
```
|
|
|
|
```{r}
|
|
#We start by computing the empirical distribution for lambda0
|
|
scan=c()
|
|
pvalue=c()
|
|
index_scan=c()
|
|
|
|
#Then, we stock the p-value and the
|
|
for (i in 1:NbSeqH1){
|
|
ppi=DataH1[[i]]
|
|
result=PValue(Emp,DataH1[[i]],T,tau)
|
|
scan=c(scan,result[1])
|
|
pvalue=c(pvalue,result[2])
|
|
index_scan=c(index_scan,result[3])
|
|
#cat(paste("\nSimulation for the sequence", i, ", for lambda0=",lambda0, " ,lambda1=", lambda1, " , scan=", result[1] ,"p-value=",result[2]))
|
|
#print(length(ppi))
|
|
}
|
|
ScS_H1=data.frame(num=1:NbSeqH1, scan_stat=scan, pvalue_scan=pvalue, class=(pvalue<0.05), begin_scan=index_scan, begin_seq_H1=seqH1begin)
|
|
sum(ScS_H1$class[which(ScS_H0$class==TRUE)])/NbSeqH1
|
|
ScS_H1
|
|
```
|
|
|
|
## 3. Local score
|
|
### Distribution of scores via Monte Carlo
|
|
```{r}
|
|
# Calcul du choix de E
|
|
E = 1
|
|
maxXk = floor(E*(log(lambda1/lambda0)))
|
|
while (maxXk < 3) {
|
|
E = E+1
|
|
maxXk = floor(E*(log(lambda1/lambda0)))
|
|
}
|
|
|
|
ppH0 = PoissonProcess(lambda0,10^4)
|
|
n1 = length(ppH0)
|
|
tbe0 = ppH0[2:n1]-ppH0[1:n1-1]
|
|
print(ks.test(tbe0, 'exp'))
|
|
|
|
xp = floor(E*(log(lambda1/lambda0)+(lambda0-lambda1)*tbe0)) # ne pas mettre le floor ni le E (certes égale à 1)
|
|
|
|
min_X = min(xp)
|
|
max_X = max(xp)
|
|
|
|
vect.score = min_X:max_X
|
|
|
|
P_X = table(factor(xp, levels = min(xp):max(xp)))
|
|
P_X = P_X/sum(table(xp))
|
|
|
|
Mean_xp = sum(vect.score*P_X)
|
|
print(Mean_xp)
|
|
|
|
#print(dist.theo.scores) # Mettre à jour avec Elisa
|
|
#print(P_X)
|
|
```
|
|
|
|
```{r}
|
|
ComputeE <- function(lambda0, lambda1){
|
|
E = 1
|
|
maxXk = floor(E*(log(lambda1/lambda0)))
|
|
while (maxXk < 3) {
|
|
E = E+1
|
|
maxXk = floor(E*(log(lambda1/lambda0)))
|
|
}
|
|
|
|
return (E)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
ScoreDistrib <- function(lambda0, lambda1, T){
|
|
E = ComputeE(lambda0, lambda1)
|
|
|
|
ppH0 = PoissonProcess(lambda0,T)
|
|
n1 = length(ppH0)
|
|
tbe0 = ppH0[2:n1]-ppH0[1:n1-1]
|
|
# print(ks.test(tbe0, 'exp'))
|
|
|
|
X = floor(E*(log(lambda1/lambda0)+(lambda0-lambda1)*tbe0)) # ne pas mettre le floor ni le E (certes égale à 1)
|
|
|
|
min_X = min(X)
|
|
max_X = max(X)
|
|
|
|
vect.score = min_X:max_X
|
|
|
|
P_X = table(factor(X, levels = min_X:max_X))
|
|
P_X = P_X/sum(table(X))
|
|
|
|
return (list("X" = X, "P_X" = P_X))
|
|
}
|
|
```
|
|
|
|
### Local score calculation
|
|
```{r}
|
|
LocaScoreMC <- function(lambda0, lambda1, E, T){
|
|
pvalue = c()
|
|
X = c()
|
|
|
|
Score = ScoreDistrib(lambda0, lambda1, T)
|
|
xp = Score$X
|
|
P_X = Score$P_X
|
|
|
|
min_X = min(xp)
|
|
max_X = max(xp)
|
|
|
|
for (i in 1:NbSeqH0){
|
|
x = floor(E*log(dexp(tbe0[[i]], rate = lambda1)/dexp(tbe0[[i]], rate = lambda0)))
|
|
X = c(X,x)
|
|
LS = localScoreC(x)$localScore[1]
|
|
|
|
daudin_result = daudin(localScore = LS, score_probabilities = P_X, sequence_length = length(x), sequence_min = min_X, sequence_max = max_X)
|
|
|
|
pvalue = c(pvalue, daudin_result)
|
|
}
|
|
LS_H0=data.frame(num=1:NbSeqH0, pvalue_scan=pvalue, class=(pvalue<0.05))
|
|
return(LS_H0)
|
|
}
|
|
```
|
|
|
|
### Experience plan
|
|
```{r}
|
|
E = 10
|
|
for (T in 10**(2:5)){
|
|
for (lambda0 in c(1, 2, 10)){
|
|
for (lambda1 in c(4, 20, 100, 1000)){
|
|
if (lambda0 < lambda1){
|
|
cat("T = ", T, ", lambda0 = ", lambda0, ", lambda1 = ", lambda1, "\n", sep = "")
|
|
LS_H0 = LocaScoreMC(lambda0, lambda1, E, T)
|
|
print(summary(LS_H0))
|
|
cat("---\n")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
```
|