人口年龄结构曲线的构造

人口年龄结构曲线的构造

这是我的爬虫入门程序!回想这么久以来,真是感慨万千:那个时候自己编程只会用 Stata 和 R,而且还不熟练;写文章只会用 Word,写作效率非常低。一年看起来短,但回想起来却好久远!

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// ---------------------------------------------------------------------
// Section 1: build population2010.dta from the saved page source
// population2010.txt. The numeric table cells are extracted from the
// HTML, reshaped into 7 columns (v1-v7) and indexed by age.
// ---------------------------------------------------------------------
clear all
// Fixed typo: the original read "set more of".
set more off
global PATH "D:\Desktop\婚姻市场\数据"
// Quoted so the directory change still works if the path ever contains spaces.
cd "$PATH"
// Open a postfile with 7 variables. post is used further down to split
// the single extracted column into several variables.
postfile mypost v1 v2 v3 v4 v5 v6 v7 ///
using population2010.dta, replace
// The import produced garbled characters, so these three lines convert
// the text file from GB18030 to UTF-8 (backups are erased afterwards).
unicode encoding set gb18030
unicode translate population2010.txt,transutf8
unicode erasebackups, badidea
// Read every line of the file into one long string variable v.
infix strL v 1-20000 using population2010.txt,clear
// Keep only the lines that hold table cells; the class names were found
// by inspecting the page source.
keep if index(v,"x:")|index(v, ///
"<td class=xl281013>")|index(v, ///
"<td class=xl311013>")
drop if index(v,"height")
// Capture the text between the first ">" and the last "<" on the line.
gen v1 = ustrregexs(1) if ustrregexm(v, ">(.*)<")
// Discard the first three remaining rows (same effect as three
// consecutive "drop if _n == 1").
drop if _n <= 3
drop v
// Convert the captured text to numbers; non-numeric text becomes missing.
gen v = real(v1)
drop v1
// Send the data with post: every 7 consecutive cells become one record.
forvalues i = 1(7)`=_N' {
post mypost (v[`i']) (v[`i'+1]) (v[`i'+2]) ///
(v[`i'+3]) (v[`i'+4]) (v[`i'+5]) (v[`i'+6])
}
postclose mypost
use population2010, clear
// Variable labels.
label var v1 "2010人口总数合计"
label var v2 "2010男性总数合计"
label var v3 "2010女性总数合计"
label var v4 "2010总人数占人口比重"
label var v5 "2010总男性总数占人口总数"
label var v6 "2010总女性总数占人口总数"
label var v7 "2010性别比"
drop if _n == 1 // the first row is the grand total
// Drop every 6th row starting from row 1, except the last row.
// NOTE(review): presumably these are five-year age-group subtotals and the
// last row is an open-ended top age group - confirm against the table.
drop if mod(_n,6) == 1 & _n ~= _N
// Age is taken from the row position (first row = age 0).
gen age = _n-1
save population2010, replace
// The commented-out part below fits the scatter plot with kernel
// regression lines.
// kernreg1 v5 age, k(3) np(100) gen(a1 r_grid1)
// kernreg1 v6 age, k(3) np(100) gen(a2 r_grid2)
// twoway line a1 r_grid1, lp("-")|| ///
// line a2 r_grid2, lp("_")
// ---------------------------------------------------------------------
// Section 2: build population2000.dta from the saved page source
// population2000.txt, using the same approach as for the 2010 data.
// The 7 columns are named v8-v14 so they do not clash with v1-v7 when
// the two datasets are merged.
// ---------------------------------------------------------------------
clear all
set more off
global PATH "D:\Desktop\婚姻市场\数据"
cd $PATH
postfile mypost v8 v9 v10 v11 v12 v13 v14 ///
using population2000.dta, replace
// Convert the text file from GB18030 to UTF-8 before reading it.
unicode encoding set gb18030
unicode translate population2000.txt,transutf8
unicode erasebackups, badidea
// Read every line of the file into one long string variable v.
infix strL v 1-20000 using population2000.txt,clear
// Keep only the lines that hold table cells (class names taken from the
// page source of the 2000 table).
keep if index(v,"x:")|index(v, ///
"<td class=xl3610554>")|index(v, ///
"<td class=xl3510554>")
drop if index(v,"height")
// Capture the text between the first ">" and the last "<" on the line.
gen v1 = ustrregexs(1) if ustrregexm(v, ">(.*)<")
// Discard the first two remaining rows (same effect as two consecutive
// "drop if _n == 1").
drop if _n <= 2
// For each j, remove the 4 rows that start at position 8 + j*42; the
// inner loop repeats the same drop because rows shift up after each one.
// NOTE(review): presumably these are extra non-data cells that recur
// every 42 kept rows in the 2000 table - confirm against the page source.
forvalues j = 0/20{
forvalues i = 0/3{
drop if _n == 8 + `j'*42
}
}
drop v
// Convert the captured text to numbers; non-numeric text becomes missing.
gen v = real(v1)
drop v1
// Every 7 consecutive cells become one record in the postfile.
forvalues i = 1(7)`=_N' {
post mypost (v[`i']) (v[`i'+1]) (v[`i'+2]) ///
(v[`i'+3]) (v[`i'+4]) (v[`i'+5]) (v[`i'+6])
}
postclose mypost
use population2000, clear
// Variable labels. Fixed: these were copied from the 2010 section and
// wrongly said "2010" although this dataset holds the 2000 census.
label var v8 "2000人口总数合计"
label var v9 "2000男性总数合计"
label var v10 "2000女性总数合计"
label var v11 "2000总人数占人口比重"
label var v12 "2000总男性总数占人口总数"
label var v13 "2000总女性总数占人口总数"
label var v14 "2000性别比"
drop if _n == 1 // the first row is the grand total
// Drop every 6th row starting from row 1, except the last row.
// NOTE(review): presumably five-year age-group subtotals - confirm.
drop if mod(_n,6) == 1 & _n ~= _N
// Age is taken from the row position (first row = age 0).
gen age = _n-1
save population2000, replace
// ---------------------------------------------------------------------
// Section 3: merge the two census datasets on age and draw the smoothed
// age-structure curves (shares and counts) for both sexes and both years.
// ---------------------------------------------------------------------
// Merge the two datasets.
use population2010, clear
merge 1:1 age using population2000
// Show how many ages matched before discarding the merge indicator.
tab _merge
drop _merge
save 0010population.dta, replace
// Fit four smoothed curves per plot (two censuses x two sexes), first for
// the percentage structure and then for the population counts.
// NOTE(review): kernreg1 and grss are not built-in Stata commands;
// presumably the user-written kernel-regression and graph slide-show
// packages - confirm they are installed before running.
clear all
set more off
use 0010population, clear
// Percentage structure: v12/v13 = 2000 male/female, v5/v6 = 2010 male/female.
qui kernreg1 v12 age , k(3) np(100) gen(a00malepercent r_grid1)
qui kernreg1 v13 age , k(3) np(100) gen(a00femalepercent r_grid2)
qui kernreg1 v5 age , k(3) np(100) gen(a10malepercent r_grid3)
qui kernreg1 v6 age , k(3) np(100) gen(a10femalepercent r_grid4)
// Fixed: the original used lp("*") and lp("+"), which are not valid line
// pattern formulas (only l, _, -, . and # are allowed), so the two 2010
// curves would not get distinct patterns. "." and "-." are valid.
grss twoway line a00malepercent r_grid1, lp("-") || ///
line a00femalepercent r_grid2, lp("_") || ///
line a10malepercent r_grid3, lp(".") || ///
line a10femalepercent r_grid4, lp("-.")
// Population counts: v9/v10 = 2000 male/female, v2/v3 = 2010 male/female.
qui kernreg1 v9 age , k(3) np(100) gen(a00male r_grid5)
qui kernreg1 v10 age , k(3) np(100) gen(a00female r_grid6)
qui kernreg1 v2 age , k(3) np(100) gen(a10male r_grid7)
qui kernreg1 v3 age , k(3) np(100) gen(a10female r_grid8)
// Same line patterns as the percentage plot, for the same reason.
grss twoway line a00male r_grid5, lp("-") || ///
line a00female r_grid6, lp("_") || ///
line a10male r_grid7, lp(".") || ///
line a10female r_grid8, lp("-.")

# Stata

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×