# Creating new variables and storing them in the existing data frame
# Way 1: assign a new column directly with `$`.
d <- data.frame(x1 = 1:4, x2 = 2:5)
d$sum <- d$x1 + d$x2
# Way 2: transform() evaluates the expressions against the columns of d; here
# it overwrites the existing `sum` column and appends `meanx`.
d <- transform(d, sum = x1 + x2, meanx = (x1 + x2) / 2)
# Recoding a variable: x1Less2 is "y" where x1 <= 2 and "n" where x1 > 2.
d <- within(d, {
  x1Less2 <- NA
  x1Less2[x1 <= 2] <- "y"
  x1Less2[x1 > 2] <- "n"
})
# Renaming variables
# Interactive option: fix() opens a data editor, so it is only offered when a
# user is present to operate it.
if (interactive()) {
  fix(d)
}
# Programmatic option: rename columns 3 and 4 (`sum`, `meanx`) by position.
names(d)[c(3, 4)] <- c("sumX", "meanX")
# Missing values
# Integer vector 1, 2, 3 followed by one missing element.
y <- c(seq_len(3), NA)
# Element-wise flags: TRUE where the value is missing.
is.na(y)
# Sum of the non-missing elements only.
sum(y, na.rm = TRUE)
# y with the missing element removed.
na.omit(y)
# Date values
# Strings already in yyyy-mm-dd form need no explicit format.
mydate <- as.Date(c("2007-06-22", "2004-02-13"))
mydate
class(mydate)
# Other layouts need a format string. %Y is the 4-digit year; %T is a
# time-of-day spec (hours:minutes:seconds) and would turn every value into NA.
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")
dates
# Formatting the current date for output.
today <- Sys.Date()
format(today, format = "%m %d %Y")
format(today, format = "%A %a %B %b %Y %y")
date()
# Date arithmetic: subtracting two Date values gives a difftime in days;
# difftime() lets the unit be chosen explicitly.
startdate <- as.Date("1993-08-22")
days <- today - startdate
days
difftime(today, startdate, units = "weeks")
# Type conversion
# The type-testing predicates are listed by name only: none of them is called
# here, so each line evaluates to the function object itself rather than to a
# TRUE/FALSE result.
is.numeric
is.character
is.factor
is.logical
is.vector
# Sorting data
# c1 repeats 1, 2, 3 three times (9 values) to line up with c2 = 1..9. The
# argument must be spelled `times`; a misspelt name is silently ignored by
# rep() and the column would only reach 9 rows through recycling.
d <- data.frame(c1 = rep(1:3, times = 3), c2 = seq(1, 9))
# Row permutation that sorts by c1 and breaks ties by c2.
index <- order(d$c1, d$c2)
index
# Apply the permutation to get the sorted data frame.
d[index, ]
# Merging data sets
## Adding columns
# Example inputs so the calls below can run on their own.
dA <- data.frame(id = 1:3, height = c(170, 165, 180))
dB <- data.frame(id = c(1L, 2L, 4L), weight = c(65, 58, 77))
# merge() joins the two frames on the shared key column `id`.
total <- merge(dA, dB, by = "id")
# cbind() places columns side by side without matching on a key, so both
# inputs need the same number of rows in the same order.
a <- data.frame(id = 1:3, height = c(170, 165, 180))
b <- data.frame(weight = c(65, 58, 77))
total <- cbind(a, b)
# Adding rows
# rbind() stacks rows, so both inputs need the same column names.
a <- data.frame(id = 1:2, height = c(170, 165))
b <- data.frame(id = 3:4, height = c(180, 175))
total <- rbind(a, b)
# Subsetting a data set
d <- head(airquality)
# Keeping variables: by position (matrix-style and list-style) or by name.
d[, c(2, 3)]
d[c(2, 3)]
d[c("Solar.R", "Wind")]
# Dropping variables: negative positions or a logical mask over the names.
d[c(-2, -3)]
d[!names(d) %in% c("Solar.R", "Wind")]
# Assigning NULL deletes columns in place. It is done on a copy because the
# subset() call below still selects Wind:Day from d and fails once Wind is
# gone.
d_dropped <- d
d_dropped[c("Solar.R", "Wind")] <- NULL
# Selecting observations
d[which(d$Ozone == 41), ]
# subset(): rows where Ozone > 20 or Day == 4, keeping columns Wind to Day.
subset(d, Ozone > 20 | Day == 4, select = Wind:Day)
# Random sampling: 5 distinct rows. seq_len() avoids the 1:0 pitfall that
# 1:nrow(d) has on an empty frame.
d[sample(seq_len(nrow(d)), 5, replace = FALSE), ]
# Manipulating data frames with SQL statements
# Install sqldf only when it is missing, so re-running the script neither
# reinstalls the package nor needs network access every time.
if (!requireNamespace("sqldf", quietly = TRUE)) {
  install.packages("sqldf")
}
library(sqldf)
# Rows of mtcars with carb = 1, ordered by mpg; row.names = TRUE is passed
# through to sqldf().
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
               row.names = TRUE)
# Mean mpg and disp per gear, restricted to 4- and 6-cylinder cars.
sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear
from mtcars where cyl in (4,6) group by gear")