if(!require(stringr))install.packages('stringr')
library(stringr)
x <- "The birch canoe slid on the smooth planks."
x
[1] "The birch canoe slid on the smooth planks."
str_length(x)
[1] 42 ### 1.检测字符串长度,包含空格和符号
length(x) #返回的是向量中元素的个数,即x中只有一个元素
[1] 1
y = c("jimmy 150","nicker 140","tony 152")
#y是字符型向量,由三个字符串组成的向量
#“jimmy 150”:字符串,一个引号中所有的东西
#引号内的单个字母/数字/符号:字符
>str_split(x," ") #以空格为分隔符号把x拆分开
[[1]] #[[1]] 列表
[1] "The" "birch" "canoe" "slid"
[5]"on" "the" "smooth" "planks."
> x2 = str_split(x," ")[[1]];x2 #拆掉[[1]]
[1] "The" "birch" "canoe" "slid" "on" "the"
[7] "smooth" "planks."
> y = c("jimmy 150","nicker 140","tony 152")
> str_split(y," ")
[[1]] #列表1
[1] "jimmy" "150"
[[2]]
[1] "nicker" "140"
[[3]]
[1] "tony" "152"
> str_split(y," ",simplify = T) #和上个代码区别一下
[,1] [,2]
[1,] "jimmy" "150"
[2,]"nicker" "140"
[3,] "tony" "152"
str_sub(x,5,9) #提取从第5位到第9位
[1] "birch"
> str_detect(x2,"h") #看看x2中是否含有“h”
[1] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
> str_starts(x2,"T") #看看x2中是否以“T”开头
[1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
> str_ends(x2,"e") #看看x2中是否以“e”结尾
[1] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
> x2
[1] "The" "birch" "canoe" "slid" "on" "the"
[7] "smooth" "planks."
> str_replace(x2,"o","A") #把x2中的o替换成A
[1] "The" "birch" "canAe" "slid" "An" "the"
[7] "smAoth" "planks."
> str_replace_all(x2,"o","A") #把x2中的所有o替换成A
[1] "The" "birch" "canAe" "slid" "An" "the"
[7] "smAAth" "planks."
> x
[1] "The birch canoe slid on the smooth planks."
> str_remove(x," ") #删除x中的第一个空格
[1] "Thebirch canoe slid on the smooth planks."
> str_remove_all(x," ") #删除x中的所有空格
[1] "Thebirchcanoeslidonthesmoothplanks."
#加载数据
> test <- iris[c(1:2,51:52,101:102),]
> rownames(test) =NULL # 去掉行名,NULL是“什么都没有”
> test
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 7.0 3.2 4.7 1.4 versicolor
4 6.4 3.2 4.5 1.5 versicolor
5 6.3 3.3 6.0 2.5 virginica
6 5.8 2.7 5.1 1.9 virginica
> library(dplyr)
> arrange(test, Sepal.Length) #arrange() 以“Sepal.Length”这一列从小到大(默认)排序 列名大多数不需要加“”
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 4.9 3.0 1.4 0.2 setosa
2 5.1 3.5 1.4 0.2 setosa
3 5.8 2.7 5.1 1.9 virginica
4 6.3 3.3 6.0 2.5 virginica
5 6.4 3.2 4.5 1.5 versicolor
6 7.0 3.2 4.7 1.4 versicolor
> arrange(test, desc(Sepal.Length)) #以“Sepal.Length”这一列从大到小排序
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 7.0 3.2 4.7 1.4 versicolor
2 6.4 3.2 4.5 1.5 versicolor
3 6.3 3.3 6.0 2.5 virginica
4 5.8 2.7 5.1 1.9 virginica
5 5.1 3.5 1.4 0.2 setosa
6 4.9 3.0 1.4 0.2 setosa
> distinct(test,Species,.keep_all = T) #按照species这一列去重复,只保留第一次出现的值 .keep_all = T(把其余的列对应的值都保留下来)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 7.0 3.2 4.7 1.4 versicolor
3 6.3 3.3 6.0 2.5 virginica
> mutate(test, new = Sepal.Length * Sepal.Width)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species new
1 5.1 3.5 1.4 0.2 setosa 17.85
2 4.9 3.0 1.4 0.2 setosa 14.70
3 7.0 3.2 4.7 1.4 versicolor 22.40
4 6.4 3.2 4.5 1.5 versicolor 20.48
5 6.3 3.3 6.0 2.5 virginica 20.79
6 5.8 2.7 5.1 1.9 virginica 15.66
#注意此时test仍为5列 因为没有赋值
> test$new=test$Sepal.Length*test$Sepal.Width
#以这种方式才是真正的添加新的一列
-------注意:R语言中的修改,都要赋值,没有赋值就没有发生过-------
# 1. Step-by-step assignment: every intermediate result gets its own name.
# Keep only the rows of iris whose Sepal.Width is greater than 3.
x1 <- filter(iris, Sepal.Width > 3)
# From those rows keep only the two sepal columns.
x2 <- select(x1, Sepal.Length, Sepal.Width)
# Sort the remaining rows by Sepal.Length (ascending is arrange()'s default).
x3 <- arrange(x2, Sepal.Length)
# 2. Pipe operator: pass the data along step by step, concise and readable.
# `%>%` (RStudio shortcut: Ctrl+Shift+M, Cmd+Shift+M on macOS) hands the
# object on its left to the function on its right as that function's first
# argument, which removes the need for repeated intermediate assignments.
x = iris %>%
filter(Sepal.Width>3) %>%
select(Sepal.Length,Sepal.Width)%>%
arrange(Sepal.Length)
# 3. Nested calls: the code is hard to read because it has to be read from the
# innermost call outward (filter, then select, then arrange).
arrange(select(filter(iris,Sepal.Width>3),
Sepal.Length,Sepal.Width),
Sepal.Length)
load('test1.Rdata')
library(dplyr)
x=arrange(dat,logFC);head(x)
> a=read.csv("../R_04/group.csv")
> a
ID title
1 GSM1052615 A375 cells 24h Control rep1
2 GSM1052616 A375 cells 24h Control rep2
3 GSM1052617 A375 cells 24h Control rep3
4 GSM1052618 A375 cells 24h Vemurafenib rep1
5 GSM1052619 A375 cells 24h Vemurafenib rep2
6 GSM1052620 A375 cells 24h Vemurafenib rep3
> g=str_split(a$title," ",simplify = T)
> g
[,1] [,2] [,3] [,4] [,5]
[1,] "A375" "cells" "24h" "Control" "rep1"
[2,] "A375" "cells" "24h" "Control" "rep2"
[3,] "A375" "cells" "24h" "Control" "rep3"
[4,] "A375" "cells" "24h" "Vemurafenib" "rep1"
[5,] "A375" "cells" "24h" "Vemurafenib" "rep2"
[6,] "A375" "cells" "24h" "Vemurafenib" "rep3"
> g[,4]
[1] "Control" "Control" "Control" "Vemurafenib"
[5] "Vemurafenib" "Vemurafenib"
x=merge(dat,ids,by = "probe_id")
x2=inner_join(dat,ids,by = "probe_id")
> tolower(g[,4])
[1] "control" "control" "control" "vemurafenib"
[5] "vemurafenib" "vemurafenib"
#或者
>str_to_lower(g[,4])
#### (1)只有if没有else,那么条件是FALSE时就什么都不做
> i = -1
> if (i<0) print('up')
[1] "up"
> if (i>0) print('up')
> #理解下面代码
> if(!require(tidyr)) install.packages('tidyr')
# Two-way branch: print "+" when i is positive, otherwise print "-".
i <- 1
if (i > 0) {
  print("+")
} else {
  print("-")
}
# ifelse(test, yes, no)
# test:逻辑值或逻辑值向量;
# yes:逻辑值为TRUE时的返回值;
# no:逻辑值为FALSE时的返回值
i = 1
ifelse(i>0,"+","-")
> ifelse(i>0,"+","-")
[1] "+"
> x = rnorm(3)
> x
[1] 0.5108486 0.7382251 -2.0977172
> ifelse(x>0,"+","-")
[1] "+" "+" "-"
#ifelse()+str_detect(),王炸
> samples = c("tumor1","tumor2","tumor3","normal1","normal2","normal3")
> k1 = str_detect(samples,"tumor");k1 #看samples里面有哪些是包含tumor的
[1] TRUE TRUE TRUE FALSE FALSE FALSE
> ifelse(k1,"tumor","normal") #ifelse()是用来替换的把T替换成tumor输出,F替换成normal输出
[1] "tumor" "tumor" "tumor" "normal" "normal" "normal"
#换一种写法,检测normal
> k2 = str_detect(samples,"normal");k2
[1] FALSE FALSE FALSE TRUE TRUE TRUE
> ifelse(k2,"normal","tumor")
[1] "tumor" "tumor" "tumor" "normal" "normal" "normal"
-----注意:yes和no的位置一定不能反-----
# Three-way branch on the sign of i: prints "+", "0" or "-".
i <- 0
if (i > 0) {
  print("+")
} else if (i == 0) {
  print("0")
} else if (i < 0) {
  print("-")
}
[1] "0"
> ifelse(i>0,"+",ifelse(i<0,"-","0")) #嵌套写法 先里后外
[1] "0"
# for (variable in vector) { ... }:variable是元素的代称;vector是向量的名字;{ }中是对每个元素进行的操作
# Element loop: i takes each value of x in turn (1, 5, 7, 3) and is printed.
x <- c(1, 5, 7, 3)
for (i in x) {
  print(i)
}
[1] 1
[1] 5
[1] 7
[1] 3
# i 分别等于1,5,7,3 输出为1,5,7,3
# For each i in x (1, 5, 7, 3), draw i random numbers with rnorm() and print them.
x <- c(1, 5, 7, 3)
for (i in x) {
  print(rnorm(i))
}
[1] 1.341562
[1] 0.38916084 0.38170028 0.04550988 1.85500644 0.33939643
[1] -1.348085038 1.011728972 0.951139408 -1.109823986 -0.479132462
[6] -0.134466769 -0.007759876
[1] -0.1804468 -1.4743407 -0.2882833
# i 分别等于1,5,7,3,相对应在rnorm中取1,5,7,3个数
#例子
> x <- c(5,6,0,3)
> s = 0
> for (i in 1:length(x)){
+ s=s+x[[i]]
+ print(c(x[[i]],s))
+ }
[1] 5 5
[1] 6 11
[1] 0 11
[1] 3 14
#3.加载deg.Rdata,根据a、b两列的值,按照以下条件生成向量x:
#a< -1 且b<0.05,则x对应的值为down;
#a>1 且b<0.05,则x对应的值为up;
#其他情况,x对应的值为no
#统计up、down、no各重复了多少次
load("deg.Rdata")
k1 = deg$a< -1 & deg$b<0.05;table(k1) #不能直接写a(它是一个列名,要用$)
k1
FALSE TRUE
26094 4681
k2 = deg$a>1 & deg$b<0.05;table(k2)
k2
FALSE
30775
x = ifelse(k1,"down",ifelse(k2,"up","no"))
table(x) # 统计up、down、no各重复了多少次
x
down no
4681 26094
# Element loop: iterate directly over the values of x, keeping a running total.
x <- c(5, 6, 0, 3)
s <- 0
for (i in x) {
  s <- s + i        # add the current element to the running total
  print(c(i, s))    # show the element next to the total so far
}
[1] 5 5
[1] 6 11
[1] 0 11
[1] 3 14
# Index loop: iterate over the positions of x (here 1 to 4) and look each
# element up with x[[i]], keeping a running total.
x <- c(5, 6, 0, 3)
s <- 0
# seq_along(x) is used instead of 1:length(x) so that an empty x yields zero
# iterations rather than the sequence c(1, 0).
for (i in seq_along(x)) {
  s <- s + x[[i]]        # wherever the element loop used i, use x[[i]]
  print(c(x[[i]], s))    # show the element next to the total so far
}
[1] 5 5
[1] 6 11
[1] 0 11
[1] 3 14
# Store each iteration's result in a list instead of only printing it.
# Relies on the numeric vector x defined earlier in the script.
s <- 0
# Pre-allocate one slot per element of x so the list is not grown in the loop.
result <- vector("list", length(x))
for (i in seq_along(x)) {
  s <- s + x[[i]]                # running total up to position i
  result[[i]] <- c(x[[i]], s)    # fill slot i with (element, running total)
}
result  # the results are kept as a list, one element per iteration
[[1]]
[1] 5 5
[[2]]
[1] 6 11
[[3]]
[1] 0 11
[[4]]
[1] 3 14
do.call(cbind,result) #将列表组成为矩阵
[,1] [,2] [,3] [,4]
[1,] 5 6 0 3
[2,] 5 11 11 14
#do.call 将列表里的元素进行批量的操作;cbind是按列拼接起来(向量长度得是相同的)
if(T){} #运行{}中的代码;可折叠
if(F){} #跳过{}中的代码
# Simulate a small expression matrix, reshape it to long format and draw
# per-gene boxplots coloured by group.
# NOTE(review): the matrix is named `exp`, which masks base::exp for the rest
# of the session; kept unchanged here.
set.seed(10086) # fix the random seed so every run (and every user) gets the same random values
exp = matrix(rnorm(18),ncol = 6) # 18 random numbers arranged in 6 columns, i.e. a 3 x 6 matrix
exp = round(exp,2) # round() keeps 2 digits after the decimal point
rownames(exp) = paste0("gene",1:3) # row names: gene1 ... gene3
colnames(exp) = paste0("test",1:6) # column names: test1 ... test6
exp[,1:3] = exp[,1:3]+1 # add 1 to the existing values of the first three columns
exp
library(tidyr)
library(tibble)
library(dplyr)
dat = t(exp) %>% # transpose: swap rows and columns
as.data.frame() %>% # convert the matrix to a data frame
rownames_to_column() %>% # turn the row names into a column of the data frame
mutate(group = rep(c("control","treat"),each = 3)) # add the grouping: first three rows "control", last three rows "treat"
pdat = dat%>%
pivot_longer(cols = starts_with("gene"), # wide-to-long: stack every column whose name starts with "gene"
names_to = "gene", # new column that receives the original column names
values_to = "count") # new column that receives the original values
library(ggplot2)
p = ggplot(pdat,aes(gene,count))+
geom_boxplot(aes(fill = group))+
theme_bw() # replace the default grey background
p
p + facet_wrap(~gene,scales = "free") # one panel per gene, each with its own axis scales
#scales = "fixed":x和y的标度在所有分面中都相同(默认)
#scales = "free":x和y的标度在每个分面中都可以变化
#scales = "free_x":y轴标度固定,x轴标度自由变化
#scales = "free_y":x轴标度固定,y轴标度自由变化
apply(X, MARGIN, FUN, ...) #X是数据框/矩阵名;MARGIN为1表示行,为2表示列;FUN是函数;对X的每一行/列执行FUN这个函数;...用于传递FUN的其他参数
> test<- iris[1:6,1:4]
> apply(test, 2, mean)
Sepal.Length Sepal.Width Petal.Length Petal.Width
4.9500000 3.3833333 1.4500000 0.2333333
> apply(test, 1, sum)
1 2 3 4 5 6
10.2 9.5 9.4 9.4 10.2 11.4
#从小到大排序,取最后的5个
sort(a)[16:20]
head(a)#默认取前6个数
head(a,5)#取前5个
head(sort(a,decreasing = T),5)#与下面结果一致
tail(sort(a),5)#从小到大排序,取最后的5个
#计算每个基因的方差 每个行
#每个基因的方差排序 每行计算方差
#最后1000个数字对应的基因
load("test2.Rdata")
tail(sort(apply(test,1,var)),1000) #每个基因的方差排序 每行计算方差,取后1000个
names(tail(sort(apply(test,1,var)),1000)) # 将后1000个的基因名称提取出来
> test <- list(x = 36:33,y = 32:35,z = 30:27);test
$x
[1] 36 35 34 33
$y
[1] 32 33 34 35
$z
[1] 30 29 28 27
> lapply(test,mean) #输出结果仍为列表
$x
[1] 34.5
$y
[1] 33.5
$z
[1] 28.5
> lapply(test,fivenum)
$x
[1] 33.0 33.5 34.5 35.5 36.0
$y
[1] 32.0 32.5 33.5 34.5 35.0
$z
[1] 27.0 27.5 28.5 29.5 30.0
> sapply(test,mean)
x y z
34.5 33.5 28.5
> sapply(test,fivenum) #fivenum():返回五个数据:最小值、下四分位数、中位数、上四分位数、最大值
x y z
[1,] 33.0 32.0 27.0
[2,] 33.5 32.5 27.5
[3,] 34.5 33.5 28.5
[4,] 35.5 34.5 29.5
[5,] 36.0 35.0 30.0
> class(sapply(test,fivenum))
[1] "matrix" "array"
#数据载入
> test1 <- data.frame(name = c('jimmy','nicker','Damon','Sophie'),
+ blood_type = c("A","B","O","AB"))
> test1
name blood_type
1 jimmy A
2 nicker B
3 Damon O
4 Sophie AB
> test2 <- data.frame(name = c('Damon','jimmy','nicker','tony'),
+ group = c("group1","group1","group2","group2"),
+ vision = c(4.2,4.3,4.9,4.5))
> test2
name group vision
1 Damon group1 4.2
2 jimmy group1 4.3
3 nicker group2 4.9
4 tony group2 4.5
> View(test)
> library(dplyr)
Attaching package: ‘dplyr’
> inner_join(test1,test2,by="name") #交集连接
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 Damon O group1 4.2
> right_join(test1,test2,by="name") #右连接,以右边的name为准将两个数据框进行连接,有右边名字的连接上,没有的不连接
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 Damon O group1 4.2
4 tony <NA> group2 4.5
> full_join(test1,test2,by="name") #全连接,都连接上,没有的写NA
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 Damon O group1 4.2
4 Sophie AB <NA> NA
5 tony <NA> group2 4.5
> semi_join(test1,test2,by="name") #半连接,基本用不上 把test1中的name也在test2中包含的人取出来 %in%
name blood_type
1 jimmy A
2 nicker B
3 Damon O
> anti_join(test1,test2,by="name") #反连接,基本用不上 把test1中的name不在test2中包含的人取出来
name blood_type
1 Sophie AB
match()# 向量进行匹配
dir()#
file.create()
file.exists()
file.remove()
sort()/table()/length()
unique()/duplicated()
names()
ifelse和str_detect() #分组
arrange() # 排序
distinct() #去重-数据框
merge() #连接
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。