可以不学习stata，只学习r语言吗

Posted 2023-04-04

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了可以不学习stata，只学习r语言吗相关的知识，希望对你有一定的参考价值。

参考技术A AnintroductiontoR：全面系统地介绍R语言，适合作为初步的参考资料。该资料是一份pdf文档，也是R语言官方手册。TryR:强烈推荐，非常简短地课程，可以在网页上进行简短的操作。该网站提供R的网页操作，所以你无需安装R,从最基本的R语言开始学期，通过实际操作掌握R的相关知识。ComputingforDataAnalysis：大约四周的视频课程。IntroductiontoRforDataMining:R进行数据挖掘方面的材料，包括一些ppt和视频资料Rstudio:R语言的集成操作环境，强烈建议安装。Rstudio会让你的工作效率指数提高。GettingstartedwithRandHadoop,关于R和Hadoop项目的资料。ggplot2:R绘图神器，该网站提供所有关于ggplot2的命令分解和介绍，同时配有大量的案例。LearningTimeSerieswithR：关于R的时间序列分析的资料。参考技术B /* Lecture 1： How to import and export data */
//备注不长于一行可用双斜杠
sysuse "auto.dta",clear
//导入系统自带的数据，clear：关闭之前所有的数据
//字符串变量：红颜色的文字变量。蓝颜色的domestic对应0，foreign对于1
replace make="AMC" in 1
//export data
cd"D:\BaiduNetdiskDownload\stata application"
help export delimited /*stata 自带的帮助文档，只导出csv和txt格式*/
/*导出其中部分变量
export delimited [你想导出的变量名称] using filename [if] [in] [,
export_delimited_options]*/
export delim using"auto.txt",replace
export delim make price using "auto.csv",replace
//MAKE 和price是变量名，只导出这两个变量
help import excel
export excel using "auto.xlsx" in 11/L
//in:第几行到第几行（L表示最后一行）
export excel using "auto.xlsx" if price>=4000,replace
//if: 满足某种条件的变量
export excel using "auto.xlsx" firstrow(var),replace
//import txt file
cd"D:\BaiduNetdiskDownload\stata application"

use "D:\BaiduNetdiskDownload\stata application\Data_luanma.dta"
help unicode
clear
unicode encoding set gb18030
unicode translate "Data_luanma.dta"
use "Data_luanma.dta",clear
//打开大数据
set excelxlsxlargefile on
//允许stata打开大文件
import excel " "

//如果内存太小，如何拆分大文件
findit chunky
help chunky

cd"D:\BaiduNetdiskDownload\stata application"
chunky using "数据文件名称", peek(1)
//看一下数据，不导入，（1）表示第一行
chunky using 数据文件名称，analyze
//分析
chunky using 数据文件名称，chunksize(10m) header(include) stub(数据文件名称）replace
//按10m的大小拆分文件
import delimited using "其中一个数据文件名称”，clear encoding（数据）
//调用拆分后的其中一个数据

sysuse auto.dta,clear
//调用系统数据
//拆分数据
export delim "auto1.csv" in 1,replace
export delim "auto2.csv" in 2, replace
export delim "auto3.csv" in 3,replace
export delim "auto4.csv" in 4, replace
export delim "auto5.csv" in 5/L, replace
help foreach
foreach num of numlist 1/3 5 8 9(10) 100
display `num'

//重新导入数据
foreach num of numlist 1/74
display "auto `num'"
import delim "auto`num'.csv", clear
save "auto`num'.dta", replace

help append
append using "auto2.csv" "auto3.csv""auto4.csv"/*
*/"auto5.csv"
use "auto1.dta", clear
foreach num of numlist 2/5
append using "auto`num'.dta"

save"auto_new.dta", replace
ssc install openall
findit openall
openall, auto?, insheet
//删除多余文件
foreach x of numlist 1/5
erase"auto`x'.csv"

//lecture 3
cd "D:\BaiduNetdiskDownload\stata application"
help format
sysuse auto.dta
//改变字符串的长度，为了方便浏览
format %30s make
//浏览
edit make
//显示字符串和数值变量
edit make price headroom
format %-20s make
//小数点后面两位数
format %3.2f headroom
//
format %10.0g price
//在result窗口展示数据
list make price headroom in 1/10
//变量的标签
sysuse auto.dta, clear
//介绍数据集的基本情况
describe
//改变整个数据的标签
label data "US auto data美国汽车数据"
//改变变量标签
label var price "auto price汽车价格"
//修改虚拟变量标签，先定义后标签
label define origin_v 0 "国产" 1 "进口"
label values foreign origin_v
describe
replace foreign = 2 in 1/8
//每一次定义新标签要重新命名origin
label define origin_new2 0 "国产" 1 "进口" 2 "unknown"
//显示
list make if foreign == 0 //一个等号表示赋值
list make price if make =="AMC Concord" /*
*/ | make =="Merc. Cougar"
list make foreign price if (foreign == 1& price <=5000) /*
*/ | (foreign == 0 & price >3000)
list make price if inlist (make, "AMC Concord" "Merc. cougar", )
sysuse auto.dta , clear
export excel using "auto.xlsx", nolabel replace
import excel using "auto.xlsx", clear
describe
//修改变量名称
rename
rename A price
rename _all, proper//让变量首字母大写，剩下字母小写
//将变量名称批量写入标签，重要的是所有的变量循环_all
foreach v of varlist _all
label variable `v' " `v' "

//shenme yisi
generate//建立新变量
replace//更改已有变量

cd "D:\BaiduNetdiskDownload\stata application"
sysuse auto.dta, clear
gen price2=price^2 //生产price2的平方
gen price_mpg = price*mpg if foreign == 1 //生成price 和mpg的交叉项如果foreign=1

replace price_mpg =0 if price_mpg == .
//用0取代缺失值
gen logprice = log(price) //生产price的对数指
gen lnprice = ln(pice) //生成price的自然对数指，其实和上行生成的结果一样的
//出现零的时候取对数会成为missing数据，可能会丢失数据
replace price_mpg = price*mpg //生成price和mpg的交叉项，并取代price_mpg变量
gen pricecateg = 0//生产pricecateg变量，并将数据分组
replace pricecateg = 1 if price >=5000 & price < 10000
replace pricecateg = 2 if price >= 10000
0
edit price pricecateg
label define category 0 "less than 5k" 1 "between 5k and 10k" 2 "more than 10k"
label values pricecateg category
//重命名
edit price pricecateg
help egen
egen priceavg3 = mean(price)
gen price_dev = price - priceavg3
//如何分别计算foreign和domestic的均值
sort foreign
//方法一
egen price_avg = mean(price) if foreign == 1
egen price_avg2 = mean(price) if foreign == 0
replace price_avg = price_avg2 if foreign == 0
drop price_avg2
//方法二
by foreign: egen priceavg_by = mean(price)//按照foreign的分类做均值
sort foreign //给foreign按数据大小排序
help tostring // 数值变量变字符串
help destring //字符串变数值变量,字符串不可以做运算的
sysuse auto.dta, clear
edit mpg
tostring mpg, gen(mpg_str)
tostring mpg, replace force//不想产生新的变量，有mpg取代原来的
edit mpg mpg_str
destring mpg_str, replace force//将字符串变数值
edit mpg mpg_str
sysuse auto.dta, clear
encode make, gen(make_num)//将文字变量重新编码成数字
edit make_num make

sysuse auto.dta, clear
//产生虚拟变量
gen dummy_high = 0//生成一个全是零的新变量
replace dummy_high = 1 if price>= 10000//用1取代价格大于某数的dummy high
//另一种方法
gen indicator_hi = (price>10000)//满足括号里面条件的为1
//展示出两个变量不一样的地方（check的方法）
edit price dummy_high indicator_hi
edit dummy_high indicator_hi if dummy_high ~=indicator_high
sum dummy_hgh indicator_high
recode foreign (0=1) (1=2), gen (for_new)

//计算price的四分卫区间， 25%， 50%， 75%
egen price_pc25 = pctile(price),p(25)
egen price_pc50 = pctile(price),p(50)
egen price_pc75 = pctile(price),p(75)//分别计算这这点的数值是多少
gen price_4cat=0
replace price_4cat=1 if price >=price_pc25&price<price_pc50
replace price_4cat=2 if price >=price_pc50&price<price_pc75
replace price_4cat=3 if price >price_pc75

/*数据合并
append merge joinby*/
//数据纵向合并
sysuse auto.dta,clear
keep if foreign == 0//只保留国产的数据
save auto_domestic.dta, replace
sysuse auto.dta, clear
keep if foreign == 1
save auto_foreign.dta,replace
append using auto_domestic.dta//合并
//数据横向合并
sysuse auto.dta, clear
gen id = _n//给横向排序（车型号）
keep make id mpg weight length
save auto_tech.dta, replace
sysuse auto.dta, clear
gen id_=n
drop make mpg weight length//丢掉一些数据
help merge
merge 1:1 id using "auto_tech.dta"
cd"D:\BaiduNetdiskDownload\stata application"
use "nei_sample.dta",clear
edit newid year so2//调用这三个变量
sort newid year
//从小到大排序
//有些变量有些地方是没有观测值的，叫做非平衡样本
gsort newid -year
edit newid year facilityname_origin
gsort -facilityname_origin year
order so2 co newid year//按这个顺序展示这些变量
order newid, before(co)//把某个变量提到某个变量之前

//string variable字符串变量
use"nei_sample.dta",clear
edit newid facilityname_origin year
sort newid facilityname_origin year
gen facility_name = facilityname_origin //生成一个变量
edit facility_name facilityname_origi
replace facility_name = lower(facility_name)//变量名称小写化
upper //变量名称大写花

//trim ltrim rtrim 去掉空格zuo you zhong
replace facility_name = trim(facility_name)
edit facility_name
replace facility_name = ltrim(facility_name)
replace facility_name = rtrim(facility_name)
replace facility_name = subinstr(facility_name,","," ",.)
replace facility_name = subinstr(facility_name,"."," ",.)
replace facility_name = subinstr(facility_name,"/"," ",.)
replace facility_name = subinstr(facility_name,"#"," ",.)
//替代标点，全部替代用空格替代
replace facility_name = subinstr(facility_name,"："," ",.)
replace facility_name = subinstr(facility_name,"’"," ",.)
replace facility_name = subinstr(facility_name,"**"," ",.)
replace facility_name = subinstr(facility_name,":"," ",.)

replace facility_name = subinword( facility_name,"company"," ",.)
replace facility_name = subinstr(facility_name,"co"," ",.)
replace facility_name = subinstr(facility_name,"inc"," ",.)
replace facility_name = subinstr(facility_name,"lp"," ",.)
replace facility_name = ltrim(facility_name)
replace facility_name = subinstr(facility_name,"u s","us",.)

gen flag = 1 if regexm(facility_name,"u s")==1
//生成新的变量将带有u s 的变量标注为一，帮助寻找
gen flag2 = 1 if regexm(facility_name,"us")==1
split facility_name
gen fac_name = facility_name1+" "+facility name2

edit zipcode
split zipcode,parse(-)
//按照某种符号拆分字符串
edit zipcode
help substr //截取
gen zip5=substr(zipcode,1,5)//生成zip5，表示截取zipcode的前五位
edit zipcode zip5 if length(zip5) ~=5 //展示长度不等于5的zip5和zipcode
edit zip5
gen len_cn = ustrlen(zipcode) //生成中文字符串长度
edit fips
gen fips2 = substr(fips, 1,2)
edit fips2
gen fips3 = substr(fips, 3,3)
edit fips2 fips3
destring fips2, replace force
destring fips3, replace force
tostring fips2 fips3, replace force
edit fips2 fips3
replace fips2="0"+fips2 if length(fips2)==1
replace fips3="0"+fips3 if length(fips3)==2
replace fips3="00"+fips3 if length(fips3)==1
//前面用零补齐，补成五位

help duplicates//重复观测值
sort newid
duplicates report newid year //报告重复观测值
duplicates tag newid year, gen(dup)
tab dup//展示
edit new year if dup>=177
duplicates drop newid year,force //去掉重复样本
duplicates report newid year
ssc install unique //安装unique
unique newid year//展示有几个是唯一的
unique fips
collapse (sum) so2 co nox nh3 voc (first)facilityname_origin fips zipcode, by(newid year)
//关于newid year重复的字符串变量，只取第一个，数值变量加总
collapse (sum) so2 co nox nh3 voc (count)newid , by(fips year)
//关于fips year 加总。。。数出newid

//改变面板数据的结构
help reshape
keep newid year so2

duplicates drop newid year,force
reshape wide so2 , i(newid) j(year)

reshape long so2, i(newid) j(year)

//
duplicates drop newid year, force
unique newid year
keep newid year so2 co nox voc nh3 sic
reshape wide so2 co nox voc nh3, i(newid sic) j(year)
//数据变少了是因为有的newid对应多个sic
reshape long so2 co nox voc nh3, i(newid sic) j(year)
//reshape来回两次就是平衡面板数据
//quiz reshape id- year-pollutant-emissions
keep newid year so2 co nox voc
ren (so2 co nox voc)(pol1 pol2 pol3 pol4 )

reshape long pol, i(newid year) j(type)

tostring type,replace force
replace type= "so2" if type=="1"

//滞后
edit newid year so2

duplicates drop newid year,force

edit newid year so2

sort newid year

by newid: gen lag1so2=so2[_n-1]
//滞后一行，不一定滞后一期

by newid:gen f1so2=so2[_n+1]

bys newid: gen Nso2 = so2[_N]//最后一期

//滞后一期，解决不平衡面板
xtset newid year

gen lso2=l.so2
//
duplicates drop newid year,force
edit fips newid year
sort fips newid year
by fips year: egen id_sum=count (newid)
edit fips year newid so2
by fips year:egen so2_fips=sum(so2)
//
collapse(sum) so2 co nox nh3 voc (first) facilityname_origin fips
//加总一个地区的所有公司的污染构造这个地区的总污染量
duplicates report newid year
collapse (sum) so2 co nox nh3 voc (count) newid ,by(fips year)
//collapse by 2-digit sic and fips_state and year
gen state = substr(fips,1,2)
gen sic2 = substr(sic,1,2)
collapse (sum) so2 co nox nh3 voc ,by(state sic2 year )

python 数据分析，R语言与数据挖掘｜学习资料分享 05

python 数据分析

为什么选 python 做数据分析？

在数据分析和交互、探索性计算以及数据可视化等方面，Python 将不可避免地接近于其他开源和商业的领域特定编程语言/工具，如R、MATLAB、SAS、Stata等。近年来，由于 Python 有不断改良的库（主要是 pandas），使其成为数据处理任务的一大替代方案。结合其在通用编程方面的强大实力，我们完全可以只使用 Python 这一种语言去构建以数据为中心的应用程序。

R语言是由 AT&T 贝尔实验室开发的一种用来进行数据探索、统计分析和作图的解释型语言。

数据挖掘（英语：Data mining），是数据库知识发现（英语：Knowledge-Discovery in Databases，简称：KDD)中的一个步骤。数据挖掘一般是指从大量的数据中通过算法搜索隐藏于其中信息的过程。数据挖掘通常与计算机科学有关，并通过统计、在线分析处理、情报检索、机器学习、专家系统（依靠过去的经验法则）和模式识别等诸多方法来实现上述目标。

技术分享图片

关于更新

本次分享以代码实战、电子书为主，包含中英文资料。后面会分享关于 python 的视频资料。

本次分享内容

python 数据分析
R 语言
数据挖掘

共勉

想要将来展翅飞翔，那么就要学会珍惜自己的羽毛

地址

公众号「你未读」，回复关键字 python数据分析1 即可。

技术分享图片

以上是关于可以不学习stata，只学习r语言吗的主要内容，如果未能解决你的问题，请参考以下文章