r 在R中创建示例data.frame

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 在R中创建示例data.frame相关的知识,希望对你有一定的参考价值。

### Function to Generate a Random Data Set
#   an adaption of
#   https://trinkerrstuff.wordpress.com/2012/05/02/function-to-generate-a-random-data-set
#   Posted on 2 May, 2012	by tylerrinker

#   The props Function
#   The props function generates a data frame of proportions whose rows sum to 1.  It takes two required arguments and an optional var.names argument.  The two required arguments are the dimensions of the data frame (ncol first, then nrow).  The optional var.names argument names the columns; otherwise they are named X1..Xn.  Note that this function is a poorer choice when there are many columns.  For that case Dason of talkstats.com provides an alternative (props2) that is slower but better suited to numerous columns.
#############################################################
# function to generate random proportions whose rowSums = 1 #
#############################################################
# props: build a data frame of random proportions whose rows each sum to 1.
#
# Arguments:
#   ncol       number of proportion columns; must be at least 2.
#   nrow       number of rows to generate.
#   var.names  optional character vector of column names; when NULL the
#              columns keep the default data.frame names (X1, X2, ...).
# Returns:
#   a data.frame with nrow rows and ncol numeric columns.  Every value is a
#   non-negative multiple of 0.01.
#
# Each row is built by repeatedly drawing a share of whatever is still
# unallocated; the last column receives the remainder.
props <- function(ncol, nrow, var.names=NULL) {
    if (ncol < 2) stop("ncol must be greater than 1")

    # Draw one row of k shares.  The bookkeeping is done in whole hundredths
    # (integers) so the running total can never drift above 1 through
    # floating-point error, which would make a later draw fail or leave a
    # negative remainder.
    draw_row <- function(k) {
        left <- 100L
        shares <- integer(k)
        for (j in seq_len(k - 1L)) {
            # uniform over 0, 1, ..., left hundredths
            shares[j] <- sample.int(left + 1L, 1L) - 1L
            left <- left - shares[j]
        }
        shares[k] <- left
        shares / 100
    }

    rows <- lapply(seq_len(nrow), function(i) draw_row(ncol))
    # Fill by row so each generated vector becomes one row of the result.
    cells <- matrix(as.numeric(unlist(rows)), nrow = nrow, ncol = ncol,
        byrow = TRUE)
    DF <- data.frame(cells)
    if (!is.null(var.names)) colnames(DF) <- var.names
    DF
}
#   or
# props2: alternative generator of random proportions whose rows sum to 1.
#
# Arguments:
#   nrow       number of rows to generate (default 10).
#   ncol       number of proportion columns (default 5).
#   var.names  optional character vector of column names; when NULL the
#              columns are named X1, X2, ...
#   digits     number of decimal places each proportion is rounded to.
# Returns:
#   a data.frame with nrow rows and ncol numeric columns.
#
# Each row is a vector of rgamma(ncol, 1, 1) draws divided by their sum and
# rounded to `digits`.  Rounding can leave the row total slightly off 1; the
# difference is moved onto the largest entry.
props2 <- function(nrow=10, ncol=5, var.names=NULL, digits=2) {
    draw_row <- function(k, digits) {
        raw <- rgamma(k, 1, 1)
        shares <- round(raw / sum(raw), digits = digits)
        # Compare at the working precision: an exact `sum != 1` test also
        # fires on floating-point noise and would leave un-rounded values.
        excess <- round(sum(shares) - 1, digits = digits)
        if (excess != 0) {
            top <- which.max(shares)
            shares[top] <- round(shares[top] - excess, digits = digits)
        }
        shares
    }

    rows <- lapply(seq_len(nrow), function(i) draw_row(ncol, digits))
    # Build the matrix explicitly (one generated vector per row) so that a
    # single-column request still yields nrow rows instead of being
    # transposed.
    cells <- matrix(as.numeric(unlist(rows)), nrow = nrow, ncol = ncol,
        byrow = TRUE)
    colnames(cells) <- paste0("X", seq_len(ncol))
    DF <- data.frame(cells)
    if (!is.null(var.names)) colnames(DF) <- var.names
    DF
}

#   The NAins Function
#   The NAins function takes a data frame and randomly inserts a certain proportion of missing (NA) values.  The function has two arguments: df, which is the data frame, and prop, which is the proportion of NA values to be inserted into the data frame (default is .1).
#   Special thanks again to Dason of talkstats.com for helping to speed up this function.  It consumes considerable time in DFgen, and he provided the code that made it much faster.
################################################################
# RANDOMLY INSERT A CERTAIN PROPORTION OF NAs INTO A DATAFRAME #
################################################################
# NAins / NAinsert: blank out a random share of the cells of a data frame.
#
# Arguments:
#   df    the data frame (anything supporting nrow(), ncol() and
#         df[i, j] <- NA) to receive missing values.
#   prop  proportion of cells to set to NA, between 0 and 1 (default .1).
#         The number of cells is ceiling(prop * nrow * ncol).
# Returns:
#   a copy of df with the chosen cells replaced by NA.  The cells are drawn
#   without replacement, so exactly that many cells are affected.
NAins <-  NAinsert <- function(df, prop = .1){
    if (!is.numeric(prop) || length(prop) != 1L || is.na(prop) ||
        prop < 0 || prop > 1) {
        stop("prop must be a single number between 0 and 1")
    }
    n <- nrow(df)
    m <- ncol(df)
    num.to.na <- ceiling(prop*n*m)
    # Nothing to blank out (prop = 0 or an empty input): seq(0) would yield
    # c(1, 0) and index with NA, so return early instead.
    if (num.to.na == 0) return(df)

    # Zero-based linear cell ids, decoded row-major into (row, column).
    id <- sample.int(m*n, num.to.na, replace = FALSE) - 1L
    rows <- id %/% m + 1
    cols <- id %% m + 1

    # One assignment per affected column rather than one per cell.
    for (j in unique(cols)) {
        df[rows[cols == j], j] <- NA
    }
    df
}

#   The DFgen Function
#   The DFgen function randomly generates a data set of length n with predefined variables.
############################################################
# GENERATE A RANDOM DATA SET.  CAN BE SET TO LONG OR WIDE. #
# DATA SET HAS FACTORS AND NUMERIC VARIABLES AND CAN       #
# OPTIONALLY GIVE BUDGET EXPENDITURES AS A PROPORTION.     #
# CAN ALSO TELL A PROPORTION OF CELLS TO BE MISSING VALUES #
############################################################
# NOTE RELIES ON THE props FUNCTION AND THE NAins FUNCTION #
############################################################
# DFgen / DFmaker: generate a random demonstration data set.
#
# Arguments:
#   n           number of subjects (rows in the wide layout).
#   type        layout of the result, wide or long.  It may be written bare
#               (type = long), as a string (type = "long"), or as a variable
#               or expression that evaluates to one of those strings.  A bare
#               wide/long is always taken literally.
#   digits      decimal places the numeric columns are rounded to.
#   proportion  if TRUE, three budget-share columns (food, housing, other)
#               produced by props() are inserted after the income column.
#   na.rate     proportion of cells, excluding the id column, to replace
#               with NA via NAins(); 0 leaves the data complete.
# Returns:
#   a data.frame.  The wide layout has one row per subject with columns id,
#   group, hs.grad, race, gender, age, m.status, political, n.kids, income,
#   score, time1, time2 and time3.  The long layout stacks time1..time3 into
#   a "time" label column and a "value" column (three rows per subject).
# Raises:
#   an error with message 'Invalid Data "type"' when type is neither wide
#   nor long.
DFgen <- DFmaker <- function(n=10, type=wide, digits=2, 
    proportion=FALSE, na.rate=0) 
{
    # Drop row names left over from reshaping.
    rownamer <- function(dataframe){
        x <- as.data.frame(dataframe)
        rownames(x) <- NULL
        x
    }

    # Round every numeric column, leaving the other columns untouched.
    dfround <- function(dataframe, digits = 0){
        is.num <- vapply(dataframe, is.numeric, logical(1))
        dataframe[is.num] <- lapply(dataframe[is.num], round, digits = digits)
        dataframe
    }

    # Resolve the layout.  The unevaluated argument is inspected first so the
    # historical bare-word usage (type = long) keeps working; only when that
    # is not literally wide/long is the argument evaluated, which lets
    # callers pass a variable holding the layout name.
    TYPE <- as.character(substitute(type))
    if (!identical(TYPE, "wide") && !identical(TYPE, "long")) {
        type.value <- tryCatch(type, error = function(e) NULL)
        if (!is.character(type.value) || length(type.value) != 1L ||
            !type.value %in% c("wide", "long")) {
            stop("Invalid Data \"type\"")
        }
        TYPE <- type.value
    }

    time1 <- sample(1:100, n, replace = TRUE) + abs(rnorm(n))
    DF <- data.frame( id = paste0("ID.", seq_len(n)), 
        group= sample(c("control", "treat"), n, replace = TRUE),
        hs.grad = sample(c("yes", "no"), n, replace = TRUE), 
        race = sample(c("black", "white", "asian"), n, 
            replace = TRUE, prob=c(.25, .5, .25)), 
        gender = sample(c("male", "female"), n, replace = TRUE), 
        age = sample(18:40, n, replace = TRUE),
        m.status = sample(c("never", "married", "divorced", "widowed"), 
            n, replace = TRUE, prob=c(.25, .4, .3, .05)), 
        political = sample(c("democrat", "republican", 
            "independent", "other"), n, replace= TRUE, 
            prob=c(.35, .35, .20, .1)),
        n.kids = rpois(n, 1.5), 
        # Incomes up to 30000 appear in both sequences, so they are drawn
        # twice as often as the higher values.
        income = sample(c(seq(0, 30000, by=1000), 
            seq(0, 150000, by=1000)), n, replace=TRUE),
        score = rnorm(n), 
        time1, 
        # Later measurements never fall below time1.
        time2 = c(time1 + 2 * abs(rnorm(n))), 
        time3 = c(time1 + (4 * abs(rnorm(n)))))
    if (proportion) {
        # Columns 1:10 run from id to income; 11:14 are score and time1..3.
        DF <- cbind (DF[, 1:10], 
            props(ncol=3, nrow=n, var.names=c("food", 
                "housing", "other")),
            DF[, 11:14])
    }
    if (na.rate!=0) {  
        # Keep the id column complete; only the remaining columns get NAs.
        DF <- cbind(DF[, 1, drop=FALSE], NAins(DF[, -1], 
            prop=na.rate))
    }
    DF <- switch(TYPE, 
        wide = DF, 
        long = {DF <- reshape(DF, direction = "long", idvar = "id",
                varying = c("time1","time2", "time3"),
                v.names = c("value"),
                timevar = "time", times = c("time1", "time2", "time3"))
            rownamer(DF)}, 
        stop("Invalid Data \"type\""))
    dfround(DF, digits=digits)
}

以上是关于r 在R中创建示例data.frame的主要内容,如果未能解决你的问题,请参考以下文章

在 R 中创建类似 sgplot 的离散热图

R: 从条件列表中创建指标列。

你如何在 R 中创建一个虚拟数据集? [复制]

在 R 中创建边列表

将 R data.frame/tbl 导出到 Google BigQuery 表

有没有更好的方法在 R 中创建分位数“虚拟”/因子?