使用python将matlab文件转换成csv文件

Reading time ~16 minutes

###使用python将matlab文件转换成csv文件


####说明 因为项目需要,需要将matlab的文件读取出来,并转换成csv各式

####环境 python2.6.6 centos6.4

####调用了python模块

1
2
3
import scipy.io
import pandas as pd
import numpy as np

scipy用来加载mat文件 numpy用来解析mat文件的结构体,转成多维数据结构 pandas将多维数据转换成dataframe后保存为csv文件

####读取matlab结构体文件

拿期货数据举例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import scipy.io
import pandas as pd
import numpy as np
code = scipy.io.loadmat('/fitl/TDB_FURTURE/futurecode/matlab.mat')
>>> code.keys()
['code', '__version__', '__header__', '__globals__']
>>> code = code.get('code')
>>> type(code)
<type 'numpy.ndarray'>
>>> code.ndim
2
>>> code.shape
(1, 1)
>>> code.dtype
dtype([('WindCode', 'O'), ('Code', 'O'), ('Market', 'O'), ('CNName', 'O'), ('ENName', 'O'), ('Type', 'O')])
>>> code.size
1

说明:scipy加载的mat文件后是字典类型,然后获取下标为‘code’的字典的值,为numpy.ndarray多维数据结构;

####解析多维数组的数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
>>> code[0][0][0][:]
array([u'399001.SZ', u'399002.SZ', u'399003.SZ', ..., u'TT1512.CF',
       u'TT1512-S.CF', u'y.CF'],
      dtype='<U48')
>>> code[0][0][1][:]
array([u'399001', u'399002', u'399003', ..., u'TT1512', u'TT1512-S', u'y'],
      dtype='<U48')
>>> code[0][0][2][:]
array([u'SZ', u'SZ', u'SZ', ..., u'CF', u'CF', u'CF'],
      dtype='<U48')
>>> code[0][0][3][:]
array([u'\u6df1\u8bc1\u6210\u6307', u'\u6df1\u6210\u6307R',
       u'\u6210\u4efd\uff22\u6307', ..., u'TT1512(\u4eff\u771f)',
       u'TT1512(\u4eff\u771f)', u''],
      dtype='<U48')
>>> code[0][0][4][:]
array([u'', u'', u'', ..., u'', u'', u''],
      dtype='<U48')
>>> code[0][0][5][:]
array([u'\u4ea4\u6613\u6240\u6307\u6570',
       u'\u4ea4\u6613\u6240\u6307\u6570',
       u'\u4ea4\u6613\u6240\u6307\u6570', ..., u'\u6307\u6570\u671f\u8d27',
       u'\u672a\u77e5', u'\u672a\u77e5'],
      dtype='<U48')

说明:获取多维数组的各个维度数据,均采用unicode编码

####讲多维数组转换成数据框

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
>>> windcode = pd.DataFrame.from_records(code[0][0][0][:])
>>> new_code = code[0][0][0][:].reshape(code[0][0][0][:].size,1)
>>> new_code
array([[u'399001.SZ'],
       [u'399002.SZ'],
       [u'399003.SZ'],
       ...,
       [u'TT1512.CF'],
       [u'TT1512-S.CF'],
       [u'y.CF']],
      dtype='<U48')
>>> windcode = pd.DataFrame.from_records(new_code,columns=['windcode'])
>>> windcode
          windcode
0        399001.SZ
1        399002.SZ
2        399003.SZ
3        399004.SZ
4        399005.SZ
5        399100.SZ
6        399101.SZ
7        399106.SZ
8        399107.SZ
9        399108.SZ
10       399110.SZ
11       399120.SZ
12       399130.SZ
13       399131.SZ
14       399132.SZ
15       399133.SZ
16       399134.SZ
17       399135.SZ
18       399136.SZ
19       399137.SZ
20       399138.SZ
21       399139.SZ
22       399140.SZ
23       399150.SZ
24       399160.SZ
25       399170.SZ
26       399180.SZ
27       399190.SZ
28       399200.SZ
29       399210.SZ
...            ...
22296     T1412.CF
22297     T1503.CF
22298     T1506.CF
22299     T1509.CF
22300     T1512.CF
22301     T1603.CF
22302     T1606.CF
22303    TF1312.CF
22304    TF1403.CF
22305    TF1406.CF
22306    TF1409.CF
22307    TF1412.CF
22308    TF1503.CF
22309    TF1506.CF
22310    TF1509.CF
22311    TF1512.CF
22312    TF1603.CF
22313    TF1606.CF
22314      TFC1.CF
22315      TFC2.CF
22316      TFC3.CF
22317    TT1412.CF
22318    TT1503.CF
22319    TT1506.CF
22320  TT1506-S.CF
22321    TT1509.CF
22322  TT1509-S.CF
22323    TT1512.CF
22324  TT1512-S.CF
22325         y.CF

>>> windcode.join(dcode)
          windcode      code
0        399001.SZ    399001
1        399002.SZ    399002
2        399003.SZ    399003
3        399004.SZ    399004
4        399005.SZ    399005
5        399100.SZ    399100
6        399101.SZ    399101
7        399106.SZ    399106
8        399107.SZ    399107
9        399108.SZ    399108
10       399110.SZ    399110
11       399120.SZ    399120
12       399130.SZ    399130
13       399131.SZ    399131
14       399132.SZ    399132
15       399133.SZ    399133
16       399134.SZ    399134
17       399135.SZ    399135
18       399136.SZ    399136
19       399137.SZ    399137
20       399138.SZ    399138
21       399139.SZ    399139
22       399140.SZ    399140
23       399150.SZ    399150
24       399160.SZ    399160
25       399170.SZ    399170
26       399180.SZ    399180
27       399190.SZ    399190
28       399200.SZ    399200
29       399210.SZ    399210
...            ...       ...
22296     T1412.CF     T1412
22297     T1503.CF     T1503
22298     T1506.CF     T1506
22299     T1509.CF     T1509
22300     T1512.CF     T1512
22301     T1603.CF     T1603
22302     T1606.CF     T1606
22303    TF1312.CF    TF1312
22304    TF1403.CF    TF1403
22305    TF1406.CF    TF1406
22306    TF1409.CF    TF1409
22307    TF1412.CF    TF1412
22308    TF1503.CF    TF1503
22309    TF1506.CF    TF1506
22310    TF1509.CF    TF1509
22311    TF1512.CF    TF1512
22312    TF1603.CF    TF1603
22313    TF1606.CF    TF1606
22314      TFC1.CF      TFC1
22315      TFC2.CF      TFC2
22316      TFC3.CF      TFC3
22317    TT1412.CF    TT1412
22318    TT1503.CF    TT1503
22319    TT1506.CF    TT1506
22320  TT1506-S.CF  TT1506-S
22321    TT1509.CF    TT1509
22322  TT1509-S.CF  TT1509-S
22323    TT1512.CF    TT1512
22324  TT1512-S.CF  TT1512-S
22325         y.CF         y

[22326 rows x 2 columns]

说明:使用强大的pandas将多维数组转成dataframe; 将多个dataframe连接成一个大的dataframe

####dataframe保存为csv

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
>>> windcode.join(dcode).to_csv('/opt/wind_code.csv',encoding='utf-8',index=False)
[root@client opt]# more wind_code.csv
windcode,code
399001.SZ,399001
399002.SZ,399002
399003.SZ,399003
399004.SZ,399004
399005.SZ,399005
399100.SZ,399100
399101.SZ,399101
399106.SZ,399106
399107.SZ,399107
399108.SZ,399108
399110.SZ,399110
399120.SZ,399120
399130.SZ,399130
399131.SZ,399131
399132.SZ,399132
399133.SZ,399133

备注:numpy和pandas这两个库还是非常的强大,处理数据的速度也是很优秀

####一个完整的python程序

功能:读取2015年的股指期货行情数据,mat文件的结构体,并将结构体内的数据抽取出来,转换成dataframe,然后保存为csv文件。一共1亿6千多万条数据。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/python
#coding:utf-8
import scipy.io as sio
import csv
import os
import os.path
import traceback
import time
import numpy as np
import pandas as pd

rootdir = "/fitl/TDB_FURTURE/futureab/2015_Month/"
csvdir = "/fitl/TDB_FURTURE/futureab/2015_csv/"
for parent,dirnames,filenames in os.walk(rootdir):
    for filename in filenames:
        if os.path.splitext(filename)[1] == '.mat':
            f_name = os.path.join(parent,filename)
            f_name_1 = f_name.split("/")
            f_name_2 = f_name_1[len(f_name_1)-1]
            file_name = f_name.replace('.mat','')
            csv_f = csvdir+f_name_2 + '.csv'
            isexist = os.path.exists(csvdir+file_name + '.csv')
            #f_year = int(file_name[0:4])#string.atoi(file_name[0:4])
            #f_month = int(file_name[4:6]) #string.atoi()
            #f_day = int(file_name[6:8])
            print('convert mat file is :')
            print(f_name)
            print(file_name)
	    print(csv_f)
            if not isexist:
                try:
                    print('read mat file..........\n')
                    #读取mat结构体文件
                    data = sio.loadmat(f_name)
		    data = data.get('futureab')
                    #解析第一个变量
		    print('dataframe..........\n')
		    WindCode = pd.DataFrame.from_records(data[0][0][0][:].reshape(data[0][0][0][:].size,1),columns=['WindCode'])
                    Code = pd.DataFrame.from_records(data[0][0][1][:].reshape(data[0][0][1][:].size,1),columns=['Code'])
                    Date = pd.DataFrame.from_records(data[0][0][2][:].reshape(data[0][0][2][:].size,1),columns=['Date'])
                    Time = pd.DataFrame.from_records(data[0][0][3][:].reshape(data[0][0][3][:].size,1),columns=['Time'])
                    Volume = pd.DataFrame.from_records(data[0][0][4][:].reshape(data[0][0][4][:].size,1),columns=['Volume'])
                    Turover = pd.DataFrame.from_records(data[0][0][5][:].reshape(data[0][0][5][:].size,1),columns=['Turover'])
                    Settle = pd.DataFrame.from_records(data[0][0][6][:].reshape(data[0][0][6][:].size,1),columns=['Settle'])
                    Position = pd.DataFrame.from_records(data[0][0][7][:].reshape(data[0][0][7][:].size,1),columns=['Position'])
                    CurDelta = pd.DataFrame.from_records(data[0][0][8][:].reshape(data[0][0][8][:].size,1),columns=['CurDelta'])
                    TradeFlag = pd.DataFrame.from_records(data[0][0][9][:].reshape(data[0][0][9][:].size,1),columns=['TradeFlag'])
                    AccVolume = pd.DataFrame.from_records(data[0][0][10][:].reshape(data[0][0][10][:].size,1),columns=['AccVolume'])
                    AccTurover = pd.DataFrame.from_records(data[0][0][11][:].reshape(data[0][0][11][:].size,1),columns=['AccTurover'])
                    Open = pd.DataFrame.from_records(data[0][0][12][:].reshape(data[0][0][12][:].size,1),columns=['Open'])
                    High = pd.DataFrame.from_records(data[0][0][13][:].reshape(data[0][0][13][:].size,1),columns=['High'])
                    Low = pd.DataFrame.from_records(data[0][0][14][:].reshape(data[0][0][14][:].size,1),columns=['Low'])
                    Price = pd.DataFrame.from_records(data[0][0][15][:].reshape(data[0][0][15][:].size,1),columns=['Price'])
                    AskPrice = pd.DataFrame.from_records(data[0][0][16][:].reshape(data[0][0][16][:].size,1),columns=['AskPrice'])
                    AskVolume = pd.DataFrame.from_records(data[0][0][17][:].reshape(data[0][0][17][:].size,1),columns=['AskVolume'])
                    BidPrice = pd.DataFrame.from_records(data[0][0][18][:].reshape(data[0][0][18][:].size,1),columns=['BidPrice'])
                    BidVolume = pd.DataFrame.from_records(data[0][0][19][:].reshape(data[0][0][19][:].size,1),columns=['BidVolume'])
                    PreClose = pd.DataFrame.from_records(data[0][0][20][:].reshape(data[0][0][20][:].size,1),columns=['PreClose'])
                    PreSettle = pd.DataFrame.from_records(data[0][0][21][:].reshape(data[0][0][21][:].size,1),columns=['PreSettle'])
                    PrePosition = pd.DataFrame.from_records(data[0][0][22][:].reshape(data[0][0][22][:].size,1),columns=['PrePosition'])
                    #join矩阵
		    print('save to csv file..........\n')
                    WindCode.join(Code).join(Date).join(Time).join(Volume).join(Turover).join(Settle).join(Position).join(CurDelta).join(TradeFlag).join(AccVolume).join(AccTurover)\
                    .join(Open).join(High).join(Low).join(Price).join(AskPrice).join(AskVolume).join(BidPrice).join(BidVolume).join(PreClose).join(PreSettle).join(PrePosition)\
                    .to_csv(csv_f,encoding='utf-8',index=False)
                    print('successful write mat file into csv file!\n')
                        
                    f_success = open('/opt/hj/w_success.csv','a')
                    w_success = csv.writer(f_success)
                    w_time = time.strftime('%Y-%m-%d %X',time.localtime(time.time()))
                    w_success.writerow([filename,'success',w_time])
                    f_success.close()
                    
                except:
                    print(file_name)
                    error_file_log = open('/opt/hj/error_mat_file.csv','a')
                    w_error_file = csv.writer(error_file_log)
                    w_error_file.writerow([filename])
                    error_file_log.flush()
                    error_file_log.close()
                    #print(filename)
                    error_log = open("/opt/hj/python_mat_error.txt",'a')
                    traceback.print_exc(file = error_log)
                    error_log.flush()
                    error_log.close()
                    continue

Puppet证书过期处理

Published on November 12, 2018

沪牌拍牌有多难

Published on March 12, 2018