###使用python将matlab文件转换成csv文件
####说明 因为项目需要,需要将matlab的文件读取出来,并转换成csv各式
####环境 python2.6.6 centos6.4
####调用了python模块
1
2
3
import scipy.io
import pandas as pd
import numpy as np
scipy用来加载mat文件 numpy用来解析mat文件的结构体,转成多维数据结构 pandas将多维数据转换成dataframe后保存为csv文件
####读取matlab结构体文件
拿期货数据举例
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import scipy.io
import pandas as pd
import numpy as np
code = scipy.io.loadmat('/fitl/TDB_FURTURE/futurecode/matlab.mat')
>>> code.keys()
['code', '__version__', '__header__', '__globals__']
>>> code = code.get('code')
>>> type(code)
<type 'numpy.ndarray'>
>>> code.ndim
2
>>> code.shape
(1, 1)
>>> code.dtype
dtype([('WindCode', 'O'), ('Code', 'O'), ('Market', 'O'), ('CNName', 'O'), ('ENName', 'O'), ('Type', 'O')])
>>> code.size
1
说明:scipy加载的mat文件后是字典类型,然后获取下标为‘code’的字典的值,为numpy.ndarray多维数据结构;
####解析多维数组的数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
>>> code[0][0][0][:]
array([u'399001.SZ', u'399002.SZ', u'399003.SZ', ..., u'TT1512.CF',
u'TT1512-S.CF', u'y.CF'],
dtype='<U48')
>>> code[0][0][1][:]
array([u'399001', u'399002', u'399003', ..., u'TT1512', u'TT1512-S', u'y'],
dtype='<U48')
>>> code[0][0][2][:]
array([u'SZ', u'SZ', u'SZ', ..., u'CF', u'CF', u'CF'],
dtype='<U48')
>>> code[0][0][3][:]
array([u'\u6df1\u8bc1\u6210\u6307', u'\u6df1\u6210\u6307R',
u'\u6210\u4efd\uff22\u6307', ..., u'TT1512(\u4eff\u771f)',
u'TT1512(\u4eff\u771f)', u''],
dtype='<U48')
>>> code[0][0][4][:]
array([u'', u'', u'', ..., u'', u'', u''],
dtype='<U48')
>>> code[0][0][5][:]
array([u'\u4ea4\u6613\u6240\u6307\u6570',
u'\u4ea4\u6613\u6240\u6307\u6570',
u'\u4ea4\u6613\u6240\u6307\u6570', ..., u'\u6307\u6570\u671f\u8d27',
u'\u672a\u77e5', u'\u672a\u77e5'],
dtype='<U48')
说明:获取多维数组的各个维度数据,均采用unicode编码
####讲多维数组转换成数据框
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
>>> windcode = pd.DataFrame.from_records(code[0][0][0][:])
>>> new_code = code[0][0][0][:].reshape(code[0][0][0][:].size,1)
>>> new_code
array([[u'399001.SZ'],
[u'399002.SZ'],
[u'399003.SZ'],
...,
[u'TT1512.CF'],
[u'TT1512-S.CF'],
[u'y.CF']],
dtype='<U48')
>>> windcode = pd.DataFrame.from_records(new_code,columns=['windcode'])
>>> windcode
windcode
0 399001.SZ
1 399002.SZ
2 399003.SZ
3 399004.SZ
4 399005.SZ
5 399100.SZ
6 399101.SZ
7 399106.SZ
8 399107.SZ
9 399108.SZ
10 399110.SZ
11 399120.SZ
12 399130.SZ
13 399131.SZ
14 399132.SZ
15 399133.SZ
16 399134.SZ
17 399135.SZ
18 399136.SZ
19 399137.SZ
20 399138.SZ
21 399139.SZ
22 399140.SZ
23 399150.SZ
24 399160.SZ
25 399170.SZ
26 399180.SZ
27 399190.SZ
28 399200.SZ
29 399210.SZ
... ...
22296 T1412.CF
22297 T1503.CF
22298 T1506.CF
22299 T1509.CF
22300 T1512.CF
22301 T1603.CF
22302 T1606.CF
22303 TF1312.CF
22304 TF1403.CF
22305 TF1406.CF
22306 TF1409.CF
22307 TF1412.CF
22308 TF1503.CF
22309 TF1506.CF
22310 TF1509.CF
22311 TF1512.CF
22312 TF1603.CF
22313 TF1606.CF
22314 TFC1.CF
22315 TFC2.CF
22316 TFC3.CF
22317 TT1412.CF
22318 TT1503.CF
22319 TT1506.CF
22320 TT1506-S.CF
22321 TT1509.CF
22322 TT1509-S.CF
22323 TT1512.CF
22324 TT1512-S.CF
22325 y.CF
>>> windcode.join(dcode)
windcode code
0 399001.SZ 399001
1 399002.SZ 399002
2 399003.SZ 399003
3 399004.SZ 399004
4 399005.SZ 399005
5 399100.SZ 399100
6 399101.SZ 399101
7 399106.SZ 399106
8 399107.SZ 399107
9 399108.SZ 399108
10 399110.SZ 399110
11 399120.SZ 399120
12 399130.SZ 399130
13 399131.SZ 399131
14 399132.SZ 399132
15 399133.SZ 399133
16 399134.SZ 399134
17 399135.SZ 399135
18 399136.SZ 399136
19 399137.SZ 399137
20 399138.SZ 399138
21 399139.SZ 399139
22 399140.SZ 399140
23 399150.SZ 399150
24 399160.SZ 399160
25 399170.SZ 399170
26 399180.SZ 399180
27 399190.SZ 399190
28 399200.SZ 399200
29 399210.SZ 399210
... ... ...
22296 T1412.CF T1412
22297 T1503.CF T1503
22298 T1506.CF T1506
22299 T1509.CF T1509
22300 T1512.CF T1512
22301 T1603.CF T1603
22302 T1606.CF T1606
22303 TF1312.CF TF1312
22304 TF1403.CF TF1403
22305 TF1406.CF TF1406
22306 TF1409.CF TF1409
22307 TF1412.CF TF1412
22308 TF1503.CF TF1503
22309 TF1506.CF TF1506
22310 TF1509.CF TF1509
22311 TF1512.CF TF1512
22312 TF1603.CF TF1603
22313 TF1606.CF TF1606
22314 TFC1.CF TFC1
22315 TFC2.CF TFC2
22316 TFC3.CF TFC3
22317 TT1412.CF TT1412
22318 TT1503.CF TT1503
22319 TT1506.CF TT1506
22320 TT1506-S.CF TT1506-S
22321 TT1509.CF TT1509
22322 TT1509-S.CF TT1509-S
22323 TT1512.CF TT1512
22324 TT1512-S.CF TT1512-S
22325 y.CF y
[22326 rows x 2 columns]
说明:使用强大的pandas将多维数组转成dataframe; 将多个dataframe连接成一个大的dataframe
####dataframe保存为csv
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
>>> windcode.join(dcode).to_csv('/opt/wind_code.csv',encoding='utf-8',index=False)
[root@client opt]# more wind_code.csv
windcode,code
399001.SZ,399001
399002.SZ,399002
399003.SZ,399003
399004.SZ,399004
399005.SZ,399005
399100.SZ,399100
399101.SZ,399101
399106.SZ,399106
399107.SZ,399107
399108.SZ,399108
399110.SZ,399110
399120.SZ,399120
399130.SZ,399130
399131.SZ,399131
399132.SZ,399132
399133.SZ,399133
备注:numpy和pandas这两个库还是非常的强大,处理数据的速度也是很优秀
####一个完整的python程序
功能:读取2015年的股指期货行情数据,mat文件的结构体,并将结构体内的数据抽取出来,转换成dataframe,然后保存为csv文件。一共1亿6千多万条数据。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/python
#coding:utf-8
import scipy.io as sio
import csv
import os
import os.path
import traceback
import time
import numpy as np
import pandas as pd
rootdir = "/fitl/TDB_FURTURE/futureab/2015_Month/"
csvdir = "/fitl/TDB_FURTURE/futureab/2015_csv/"
for parent,dirnames,filenames in os.walk(rootdir):
for filename in filenames:
if os.path.splitext(filename)[1] == '.mat':
f_name = os.path.join(parent,filename)
f_name_1 = f_name.split("/")
f_name_2 = f_name_1[len(f_name_1)-1]
file_name = f_name.replace('.mat','')
csv_f = csvdir+f_name_2 + '.csv'
isexist = os.path.exists(csvdir+file_name + '.csv')
#f_year = int(file_name[0:4])#string.atoi(file_name[0:4])
#f_month = int(file_name[4:6]) #string.atoi()
#f_day = int(file_name[6:8])
print('convert mat file is :')
print(f_name)
print(file_name)
print(csv_f)
if not isexist:
try:
print('read mat file..........\n')
#读取mat结构体文件
data = sio.loadmat(f_name)
data = data.get('futureab')
#解析第一个变量
print('dataframe..........\n')
WindCode = pd.DataFrame.from_records(data[0][0][0][:].reshape(data[0][0][0][:].size,1),columns=['WindCode'])
Code = pd.DataFrame.from_records(data[0][0][1][:].reshape(data[0][0][1][:].size,1),columns=['Code'])
Date = pd.DataFrame.from_records(data[0][0][2][:].reshape(data[0][0][2][:].size,1),columns=['Date'])
Time = pd.DataFrame.from_records(data[0][0][3][:].reshape(data[0][0][3][:].size,1),columns=['Time'])
Volume = pd.DataFrame.from_records(data[0][0][4][:].reshape(data[0][0][4][:].size,1),columns=['Volume'])
Turover = pd.DataFrame.from_records(data[0][0][5][:].reshape(data[0][0][5][:].size,1),columns=['Turover'])
Settle = pd.DataFrame.from_records(data[0][0][6][:].reshape(data[0][0][6][:].size,1),columns=['Settle'])
Position = pd.DataFrame.from_records(data[0][0][7][:].reshape(data[0][0][7][:].size,1),columns=['Position'])
CurDelta = pd.DataFrame.from_records(data[0][0][8][:].reshape(data[0][0][8][:].size,1),columns=['CurDelta'])
TradeFlag = pd.DataFrame.from_records(data[0][0][9][:].reshape(data[0][0][9][:].size,1),columns=['TradeFlag'])
AccVolume = pd.DataFrame.from_records(data[0][0][10][:].reshape(data[0][0][10][:].size,1),columns=['AccVolume'])
AccTurover = pd.DataFrame.from_records(data[0][0][11][:].reshape(data[0][0][11][:].size,1),columns=['AccTurover'])
Open = pd.DataFrame.from_records(data[0][0][12][:].reshape(data[0][0][12][:].size,1),columns=['Open'])
High = pd.DataFrame.from_records(data[0][0][13][:].reshape(data[0][0][13][:].size,1),columns=['High'])
Low = pd.DataFrame.from_records(data[0][0][14][:].reshape(data[0][0][14][:].size,1),columns=['Low'])
Price = pd.DataFrame.from_records(data[0][0][15][:].reshape(data[0][0][15][:].size,1),columns=['Price'])
AskPrice = pd.DataFrame.from_records(data[0][0][16][:].reshape(data[0][0][16][:].size,1),columns=['AskPrice'])
AskVolume = pd.DataFrame.from_records(data[0][0][17][:].reshape(data[0][0][17][:].size,1),columns=['AskVolume'])
BidPrice = pd.DataFrame.from_records(data[0][0][18][:].reshape(data[0][0][18][:].size,1),columns=['BidPrice'])
BidVolume = pd.DataFrame.from_records(data[0][0][19][:].reshape(data[0][0][19][:].size,1),columns=['BidVolume'])
PreClose = pd.DataFrame.from_records(data[0][0][20][:].reshape(data[0][0][20][:].size,1),columns=['PreClose'])
PreSettle = pd.DataFrame.from_records(data[0][0][21][:].reshape(data[0][0][21][:].size,1),columns=['PreSettle'])
PrePosition = pd.DataFrame.from_records(data[0][0][22][:].reshape(data[0][0][22][:].size,1),columns=['PrePosition'])
#join矩阵
print('save to csv file..........\n')
WindCode.join(Code).join(Date).join(Time).join(Volume).join(Turover).join(Settle).join(Position).join(CurDelta).join(TradeFlag).join(AccVolume).join(AccTurover)\
.join(Open).join(High).join(Low).join(Price).join(AskPrice).join(AskVolume).join(BidPrice).join(BidVolume).join(PreClose).join(PreSettle).join(PrePosition)\
.to_csv(csv_f,encoding='utf-8',index=False)
print('successful write mat file into csv file!\n')
f_success = open('/opt/hj/w_success.csv','a')
w_success = csv.writer(f_success)
w_time = time.strftime('%Y-%m-%d %X',time.localtime(time.time()))
w_success.writerow([filename,'success',w_time])
f_success.close()
except:
print(file_name)
error_file_log = open('/opt/hj/error_mat_file.csv','a')
w_error_file = csv.writer(error_file_log)
w_error_file.writerow([filename])
error_file_log.flush()
error_file_log.close()
#print(filename)
error_log = open("/opt/hj/python_mat_error.txt",'a')
traceback.print_exc(file = error_log)
error_log.flush()
error_log.close()
continue