python3脚本使用sql loader批量导入字节文件并二次处理

来源：互联网发布：mac osx yosemite.vdi 编辑：程序博客网时间：2024/06/06 03:09

由于使用cx_oracle连接oracle然后导入数据文件速度太慢（要导的数据有几百G），出于速度的考虑，选择了oracle自带的sql loader工具，使用python脚本写界面，与用户交互，然后使用system（）去批量调用sql loader，对于导入失败的bad数据以及日志数据，我用了c++写了一个程序，把bad文件转换成了ok的数据，然后又写了一个脚本处理这些bad数据。
1、python3调用sqlldr工具导入数据的脚本实现。

#!C:\Python34\python.exe'''by dairen 20170703'''from tkinter import *import tkinter.messagebox as messageboximport cx_Oracleimport timeimport datetimeimport osclass Application(Frame):    def __init__(self, master=None):        Frame.__init__(self, master,height=2000,width=4000)        self.pack()        self.createWidgets()    def createWidgets(self):        self.label_1 = Label(self,text = "文件起始日期:")        self.label_2 = Label(self,text = "文件结束日期:")        self.label_1.grid(row = 0,column = 0)        self.label_2.grid(row = 1,column = 0)        self.Input1 = Entry(self)        self.Input2 = Entry(self)        self.Input1.grid(row = 0,column = 1)        self.Input2.grid(row = 1,column = 1)        self.Button1 = Button(self, text='开始导入',width=20, command=self.runmain)        self.Button1.grid(row =2,columnspan=2)    def runmain(self):        begin_date = self.Input1.get() or '20170607'        end_date = self.Input2.get() or '20170607'        messagebox.showinfo('Tips!', '您的输入是：%s 和 %s \n 导入即将开始！' %(begin_date,end_date))        #offset:358O b_Info 40-60        aaa=[' CHAR(42)',' CHAR(42)',' CHAR(42)',' CHAR(8)',' CHAR(3)',         ' CHAR(1)',' CHAR(1)',' CHAR(1)',' CHAR(1)',' CHAR(1)',         ' CHAR(6)',' CHAR(10)',' CHAR(11)',' CHAR(11)',' CHAR(11)',         ' CHAR(11)',' CHAR(11)',' CHAR(21)',' CHAR(21)',' CHAR(21)',         ' CHAR(4)',' CHAR(6)',' CHAR(12)',' CHAR(4)',' CHAR(6)',         ' CHAR(4)',' CHAR(3)',' CHAR(2)',' CHAR(12)',' CHAR(6)',         ' CHAR(8)',' CHAR(15)',' CHAR(65)',' CHAR(3)',' CHAR(4)',         ' CHAR(6)',' CHAR(10)',' CHAR(1)',' CHAR(1)',' CHAR(5)'        ,' CHAR(2)',' CHAR(2)',' CHAR(2)',' CHAR(2)',' CHAR(4)'        ,' CHAR(4)',' CHAR(11)',' CHAR(11)',' NUMBER(12,0)',' NUMBER(12,0)'        ,' NUMBER(12,0)',' NUMBER(12,0)',' CHAR(3)',' CHAR(3)',' NUMBER(8,0)'        ,' CHAR(1)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'        ,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'        ,' NUMBER(8,0)',' CHAR(8)',' CHAR(8)',' CHAR(8)',' NUMBER(8,0)'        ,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'        ,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' CHAR(16)',' CHAR(2)'        ,' CHAR(1)',' CHAR(2)',' CHAR(14)',' CHAR(4)',' CHAR(3)'        ,' CHAR(9)',' NUMBER(12,0)',' CHAR(3)',' CHAR(1)',' CHAR(8)'        ,' CHAR(1)',' CHAR(12)',' CHAR(15)',' CHAR(6)',' CHAR(4)'        ,' CHAR(100)']        lenlist=[42,42,42,8,3,1,1,1,1,1,        6,10,11,11,11,11,11,21,21,21,        4,6,12,4,6,4,3,2,12,6,        8,15,40,3,4,6,10,1,1,5,        2,2,2,2,4,4,11,11,12,12,        12,12,3,3,8,1,8,8,8,8,        8,8,8,8,8,8,8,8,8,8,        8,8,8,8,8,8,8,8,16,2,        1,2,14,4,3,9,12,3,1,8,        1,12,15,6,4,100]        title=[        'Main_Key','Origin_Key','Er_Key','PostDate','Curr',        'If_Curr','If_Cen','Cups_Sett','If_Cvm_In','If_Single',        'Ssn_Trans','Time_Trans','Ob_Code','Trans_Code','Rec_Code',        'Cb_Code','Cvm_Code','Account11','Account_In','Account_Out',        'MsgType','ProcCode','TranAmt','ActDate','ActTime',        'MerChant_Type','Sevr_Point_In','Sevr_Point_Con','Search','AuthCode',        'Term','Ob_Id','Ob_Info','Curr_Tran','Reason_Code',        'Ssn_Trans_Ori','Time_Trans_Ori','Status_Snd','Status_Rec','Status_Trans',        'RejCode1','RejCode2','RejCode3','RejCode4','Sett_Snd_AreaCode',        'Sett_Rec_AreaCode','Sett_Snd_Code','Sett_Rec_Code' ,'Amt_Snd_Sett_dr','Amt_Snd_Sett_cr',        'Amt_Rec_Sett_dr','Amt_Rec_Sett_cr','Curr_Sett_Snd','Curr_Sett_Rec','Comm',        'Comm_Way','Comm_Snd_dr','Comm_Snd_cr','Comm_Rec_dr','Comm_Rec_cr',        'Fee_Snd_dr','Fee_Snd_cr','Fee_Rec_dr','Fee_Rec_cr','Comm_Cen_dr',        'Comm_Cen_cr','Fee_Snd_Branch_dr','Fee_Snd_Branch_cr','Fee_Rec_Branch_dr','Fee_Rec_Branch_cr',        'Fee_CanCel_Snd_dr','Fee_CanCel_Snd_cr','Fee_CanCel_Rec_dr','Fee_CanCel_Rec_cr','Fee_CanCel_Cen_dr',        'Fee_CanCel_Cen_cr','Fee_Cl_Snd_Branch_dr','Fee_Cl_Snd_Branch_cr','Fee_Cl_Rec_Branch_dr','Fee_Cl_Rec_Branch_cr',        'TranType','CardKind','CardType','Bin','Card',        'Er_Max','Ssn_Cen','TranAmt_Ori','TranCode_Ori','Reserved',        'variab1','variab2','variab3','variab4','variab5','variab6'        ]        x=len(title)        fields=[]        fields = [title[i]+aaa[i] for i in range(0,x)]        fields_str = ', '.join(fields)    #begin_date=input("input start date(格式：20170506):")    #end_date=input("input end date(格式：20170606):")        date_list = []        begin_date = datetime.datetime.strptime(begin_date, "%Y%m%d")        end_date = datetime.datetime.strptime(end_date, "%Y%m%d")        if (begin_date > end_date):            messagebox.showinfo('error!', 'begin_date or end_date error!)')        while( begin_date <= end_date):            date_str = begin_date.strftime("%Y%m%d")            date_list.append(date_str)            begin_date += datetime.timedelta(days=1)        #print(date_list)        file_name_list=[]        numlist=['01','02','03','04']        for datename in date_list:            for num in numlist:                bsi_str='BSI%s01_%s'%(datename,num)                file_name_list.append(bsi_str)            for num in numlist:                bsi_str='BSI%s51_%s'%(datename,num)                file_name_list.append(bsi_str)            file_name_list.append('BSI%s88_01'%datename)            file_name_list.append('BSI%s99_01'%datename)            for num in numlist:                bsa_str='BSA%s01_%s'%(datename,num)                file_name_list.append(bsa_str)            for num in numlist:                bsa_str='BSA%s51_%s'%(datename,num)                file_name_list.append(bsa_str)            file_name_list.append('BSA%s88_01'%datename)            file_name_list.append('BSA%s99_01'%datename)        #print(file_name_list)        table_list=[]        for datename in date_list:            bsi_str='BSI_%s'%datename            table_list.append(bsi_str)            bsa_str='BSA_%s'%datename            table_list.append(bsa_str)        #print(table_list)        #开始计时        starttime=time.clock()        try:            conn = cx_Oracle.connect('xxx/xxx@xxx/orcl')               #print("connect oracle success!!!")        except:            #print("connect oracle failed!!!")            messagebox.showinfo('warning!', 'failed connect oracle!')            return        cursor = conn.cursor()        data_num=len(date_list)        file_num=len(file_name_list)        table_num=len(table_list)        #create table        for k in range(0,table_num):            table_name= table_list[k]            #print("当前表名：%s "%table_name)            sql_createtb = "create table %s (%s)" % (table_name, fields_str)            sql_droptb='drop table %s purge'%table_name            #print(sql_createtb)            #print(sql_droptb)            try:                #print("开始建表！")                cursor.execute(sql_createtb)                #print("建表成功！")            except:                #print("表已存在！开始删表!")                cursor.execute(sql_droptb)                #print("删表成功！开始重新建表！")                cursor.execute(sql_createtb)                #print("重新建表成功！")        conn.commit()        cursor.close()           conn.close()        #execute cmd according to fiel_name        succ=0        errornum=0        for i in range(0,file_num):            try:                file_name=file_name_list[i]                   if os.path.isfile(file_name):                    table_name=table_list[i//10]                    #CHARACTERSET ZHS16GBK                    #edit ctl files according to table_name                    lines=open('bsa.ctl','r').readlines()                    for k in range(len(lines)):                        if 'INTO TABLE' in lines[k]:                            lines[k]='INTO TABLE %s\n'%table_name                    f=open('bsa.ctl','w')                    f.writelines(lines)                    f.close()                    basedir=os.getcwd()                    newname=file_name+".dat"                    os.rename(file_name,newname)                    bad_dir='%s\\bad_files'%basedir                    log_dir='%s\\log_files'%basedir                    if not os.path.exists(bad_dir):                        os.mkdir(bad_dir)                    if not os.path.exists(log_dir):                        os.mkdir(log_dir)                    #import                    cmd_str="sqlldr xxx/xxx@xxx/orcl control=%s\\bsa.ctl data=%s "%(basedir,newname)                    cmd_str+="bad=%s\\bad_files\\%s.bad log=%s\\log_files\\%s.log direct=true"%(basedir,file_name,basedir,file_name)                    #print(cmd_str)                    os.system(cmd_str)                    succ+=1                    #process bad file            except:                errornum+=1        endtime=time.clock()        messagebox.showinfo('完成！!','耗时约：%f秒,success in %d files'%((endtime-starttime),succ))app = Application()# 设置窗口标题:app.master.title('CUPS IMPORTER')# 主消息循环:app.mainloop()

2、sqlldr规则文件的编写.
sqlldr读取文件有好几种方式，我要处理的源文件是字节流的，所以使用了按字节分隔的控制文件，如下：

options(ERRORS=20000)load DATAAPPENDINTO TABLE BSA_20170607(Main_Key  POSITION(01:42)  CHAR,Origin_Key  POSITION(43:84)  CHAR,Er_Key  POSITION(85:126)   CHAR,PostDate  POSITION(127:134)   CHAR,Curr  POSITION(135:137)   CHAR,If_Curr  POSITION(138:138)  CHAR,If_Cen  POSITION(139:139)  CHAR,Cups_Sett  POSITION(140:140)  CHAR,If_Cvm_In  POSITION(141:141)  CHAR,If_Single  POSITION(142:142)  CHAR,Ssn_Trans  POSITION(143:148)  CHAR,Time_Trans  POSITION(149:158)  CHAR,Ob_Code  POSITION(159:169)  CHAR,Trans_Code  POSITION(170:180)  CHAR,Rec_Code  POSITION(181:191)  CHAR,Cb_Code  POSITION(192:202)  CHAR,Cvm_Code  POSITION(203:213)  CHAR,Account11  POSITION(214:234)  CHAR,Account_In  POSITION(235:255)  CHAR,Account_Out  POSITION(256:276)  CHAR,MsgType  POSITION(277:280)  CHAR,ProcCode  POSITION(281:286)  CHAR,TranAmt  POSITION(287:298)  CHAR,ActDate  POSITION(299:302)  CHAR,ActTime  POSITION(303:308)  CHAR,MerChant_Type  POSITION(309:312)  CHAR,Sevr_Point_In  POSITION(313:315)  CHAR,Sevr_Point_Con  POSITION(316:317)  CHAR,Search  POSITION(318:329)  CHAR,AuthCode  POSITION(330:335)  CHAR,Term  POSITION(336:343)  CHAR,Ob_Id  POSITION(344:358)  CHAR,Ob_Info  POSITION(359:398)  CHAR,Curr_Tran  POSITION(399:401)  CHAR,Reason_Code  POSITION(402:405)  CHAR,Ssn_Trans_Ori  POSITION(406:411)  CHAR,Time_Trans_Ori  POSITION(412:421)  CHAR,Status_Snd  POSITION(422:422)  CHAR,Status_Rec  POSITION(423:423)  CHAR,Status_Trans  POSITION(424:428)  CHAR,RejCode1  POSITION(429:430)  CHAR,RejCode2  POSITION(431:432)  CHAR,RejCode3  POSITION(433:434)  CHAR,RejCode4  POSITION(435:436)  CHAR,Sett_Snd_AreaCode  POSITION(437:440)  CHAR,Sett_Rec_AreaCode  POSITION(441:444)  CHAR,Sett_Snd_Code  POSITION(445:455)  CHAR,Sett_Rec_Code  POSITION(456:466)  CHAR,Amt_Snd_Sett_dr  POSITION(467:478),Amt_Snd_Sett_cr  POSITION(479:490),Amt_Rec_Sett_dr  POSITION(491:502),Amt_Rec_Sett_cr  POSITION(503:514),Curr_Sett_Snd  POSITION(515:517)  CHAR,Curr_Sett_Rec  POSITION(518:520)  CHAR,Comm  POSITION(521:528)   ,Comm_Way  POSITION(529:529)  CHAR,Comm_Snd_dr  POSITION(530:537)    ,Comm_Snd_cr  POSITION(538:545)   ,Comm_Rec_dr  POSITION(546:553)    ,Comm_Rec_cr  POSITION(554:561)   ,Fee_Snd_dr  POSITION(562:569)    ,Fee_Snd_cr  POSITION(570:577)   ,Fee_Rec_dr  POSITION(578:585)    ,Fee_Rec_cr  POSITION(586:593)   ,Comm_Cen_dr  POSITION(594:601)    ,Comm_Cen_cr  POSITION(602:609)    ,Fee_Snd_Branch_dr  POSITION(610:617)  CHAR,Fee_Snd_Branch_cr  POSITION(618:625)  CHAR,Fee_Rec_Branch_dr  POSITION(626:633)  CHAR,Fee_Rec_Branch_cr  POSITION(634:641)    ,Fee_CanCel_Snd_dr  POSITION(642:649)    ,Fee_CanCel_Snd_cr  POSITION(650:657)   ,Fee_CanCel_Rec_dr  POSITION(658:665)    ,Fee_CanCel_Rec_cr  POSITION(666:673)   ,Fee_CanCel_Cen_dr  POSITION(674:681)    ,Fee_CanCel_Cen_cr  POSITION(682:689)    ,Fee_Cl_Snd_Branch_dr  POSITION(690:697)   ,Fee_Cl_Snd_Branch_cr  POSITION(698:705)    ,Fee_Cl_Rec_Branch_dr  POSITION(706:721)  CHAR,Fee_Cl_Rec_Branch_cr  POSITION(722:723)  CHAR,TranType  POSITION(724:724)  CHAR,CardKind  POSITION(725:726)  CHAR,CardType  POSITION(727:740)  CHAR,Bin  POSITION(741:744)  CHAR,Card  POSITION(745:747)  CHAR,Er_Max  POSITION(748:756)  CHAR,Ssn_Cen  POSITION(757:768)    ,TranAmt_Ori  POSITION(769:771)  CHAR,TranCode_Ori  POSITION(772:772)  CHAR,Reserved  POSITION(773:780)  CHAR,variab1  POSITION(781:781)  CHAR,variab2  POSITION(782:793)  CHAR,variab3  POSITION(794:808)  CHAR,variab4  POSITION(809:814)  CHAR,variab5  POSITION(815:818)  CHAR,variab6  POSITION(819:918)  CHAR)

其中，我每次要针对不同的文件，把文件数据插入到不同的表，我都会读取ctl文件，修改其中的表名，然后再调用sqlldr命令。
3、c++处理sqlldr生成的bad文件，使之变成可以再次导入的文件.
头文件和cpp文件如下：
先是从网上找到的一个把gbk编码的字符数组转换成unicode编码的简单类：

// UtfFile.h: interface for the UtfFile class.///////////////////////////////////////////////////////////#if !defined(AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_)#define AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_#if _MSC_VER > 1000#pragma once#endif // _MSC_VER > 1000#include <windows.h> #include <stdio.h>#include <locale.h>#include <IOSTREAM>#include <FSTREAM>using namespace std;class UtfFile {public: static char * UnicodeToGB2312(WCHAR uData, char buffer[2]);  // Unicode 转换成 GB2312 static WCHAR * UTF_8ToUnicode(char *pText, WCHAR &unicode);  // 把UTF-8 转化成 Unicode static char * TranslateUTF8ToGB(char *str, size_t len);   // 把UTF-8字符串转化成ANSI（GB2312）编码形式 char * GetString(char *str, int maxLen = 1024);  // 读取一个字符串，以换行符为结束标示 char * GetLine(char *str, int maxLen);    // 读取一行字符 void close();          // 关闭文件流 int open(const char *sFileName);     // 用于打开一个文件 UtfFile(const char *sour); virtual ~UtfFile();public: ifstream inf;};#endif // !defined(AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_)

该类的源文件：

/*-----------------------------------------------以下为.cpp文件内容------------------------------------------*/// UtfFile.cpp: implementation of the UtfFile class.////////////////////////////////////////////////////////////////////////#include "UtfFile.h"//////////////////////////////////////////////////////////////////////// Construction/Destruction//////////////////////////////////////////////////////////////////////UtfFile::UtfFile(const char *sour):inf(sour){ if( !inf.is_open() ) {  char str[1024];  sprintf(str, "错误的文件路径，文件无法打开：%s", sour);  throw runtime_error(str); }}UtfFile::~UtfFile(){}void UtfFile::close(){ inf.close();}char * UtfFile::GetLine(char *str, int maxLen){ if ( inf.eof() ) {  str[0] = '\0'; } else {  inf.getline(str, maxLen);  TranslateUTF8ToGB(str, maxLen); } return str;}char * UtfFile::GetString(char *str, int maxLen){ if ( inf.eof() ) {  str[0] = '\0'; } else {  inf >> str;  TranslateUTF8ToGB(str, maxLen); } return str;}char * UtfFile::UnicodeToGB2312(WCHAR uData, char buffer[2]){ WideCharToMultiByte(CP_ACP,NULL, &uData, 1,buffer,sizeof(WCHAR),NULL,NULL); return buffer;}WCHAR * UtfFile::UTF_8ToUnicode(char *pText, WCHAR &unicode){ /*    http://blog.csdn.net/liuzhiyuan1982/article/details/3911150UTF-8是一种多字节编码的字符集，表示一个Unicode字符时，它可以是1个至多个字节，在表示上有规律：1字节：0xxxxxxx2字节：110xxxxx 10xxxxxx3字节：1110xxxx 10xxxxxx 10xxxxxx4字节：11110xxx 10xxxxxx 10xxxxxx 10xxxxxx*/ char *uchar = (char *)&unicode;  uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F); uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F); return &unicode;}char * UtfFile::TranslateUTF8ToGB(char *str, size_t len){ char * newCharBuffer = new char[len]; int index =0; int nCBIndex = 0; WCHAR wTemp = 0; char cTemp[2] = " "; while(index < len) {  if ( str[index] == 0 )   break;  else if(str[index] > 0)  // 如果是GB2312的字符  {   newCharBuffer[nCBIndex] = str[index];    //直接复制   index += 1;    //源字符串偏移量1   nCBIndex += 1;   //目标字符串偏移量1  }  else      //如果是UTF-8的字符  {   UTF_8ToUnicode(str + index, wTemp);   //先把UTF-8转成Unicode   UnicodeToGB2312(wTemp, &newCharBuffer[nCBIndex]); //再把Unicode 转成 GB2312   index += 3;    //源字符串偏移量3   nCBIndex += 2;   //目标字符串偏移量2  因为一个中文UTF-8占3个字节，GB2312占两个字节  } } newCharBuffer[nCBIndex] = '\0'; //结束符 strcpy( str, newCharBuffer ); delete newCharBuffer;  //避免内存泄漏，这是对源代码的稍许修改 newCharBuffer = NULL; return str;  }int UtfFile::open(const char *sFileName){ inf.open(sFileName); return inf.is_open();}

下面是我完成的main文件，大致就是对要处理的文件，读出一行，转码，替换里面的半中文字符‘？’，然后写入要输出的文件：

#include "UtfFile.h"#include<string>#include<vector>#include<io.h>#include<cstdio>using namespace std;void getFiles(string path, vector<string>& files){    //文件句柄    intptr_t   hFile = 0;    //文件信息    struct _finddata_t fileinfo;    string p;    if ((hFile = _findfirst(p.assign(path).append("\\*").c_str(), &fileinfo)) != -1)    {        do        {            //如果是目录,迭代之            //如果不是,加入列表            if ((fileinfo.attrib &  _A_SUBDIR))            {                if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0)                    getFiles(p.assign(path).append("\\").append(fileinfo.name), files);            }            else            {                files.push_back(p.assign(path).append("\\").append(fileinfo.name));            }        } while (_findnext(hFile, &fileinfo) == 0);        _findclose(hFile);    }}char* G2U(const char* gb2312){    int len = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0);    wchar_t* wstr = new wchar_t[len + 1];    memset(wstr, 0, len + 1);    MultiByteToWideChar(CP_ACP, 0, gb2312, -1, wstr, len);    len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL);    char* str = new char[len + 1];    memset(str, 0, len + 1);    WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, len, NULL, NULL);    if (wstr) delete[] wstr;    return str;}void do_bad(string in_file, string out_file){    UtfFile  myfile(in_file.c_str());  //读bad文件    ofstream outfile;    outfile.open(out_file, ios::trunc); //写文件    string tempstr;    char* mystr;    while (std::getline(myfile.inf, tempstr))    {        mystr = const_cast<char*>(tempstr.c_str());        mystr = G2U(mystr);        UtfFile::TranslateUTF8ToGB(mystr, 918);        char* str_bk = mystr;        for (int j = 0; j<918; j++)//可用find函数优化        {            if (*mystr == '?') //replace             {                *mystr = '1';                break;            }            mystr++;        }        outfile << str_bk << '\n';//    }    outfile.close();    myfile.close();}int main(){  //char strbuffer[1024]={0};  //mystr=strbuffer;  //  char * filePath = "C:\\pythonfile\\sqlldr_bsibsa\\bad_process";  //\\bad_files  vector<string> files;  vector<string> in_files;  vector<string> out_files;  ////获取该路径下的所有文件  getFiles(filePath, files);  //BSA2017060701_01.bad  int total_size = files.size();  for (int i = 0; i < total_size; i++)  {       //cout << files[i] << endl;      int pos = files[i].find_last_of('\\');      string name_with_exp(files[i].substr(pos + 1));      string base_dir(files[i].substr(0,pos+1));      cout << name_with_exp.size() << endl;      if (name_with_exp.size() == 20)      {          in_files.push_back(files[i]);          //cout << s << endl;          string name_only = name_with_exp.substr(0, 16);         // char str[50];          cout << name_only << endl;         // sprintf(str,"%s.bad",name_only);         // cout << str << endl;          string full_path;          full_path = base_dir + name_only + ".dat";          //cout << full_path << endl;          out_files.push_back(full_path);      }  }  int input_size = in_files.size();  for (int i = 0; i < input_size; i++)  {      do_bad(in_files[i], out_files[i]);  }  //C:\\pythonfile\\sql_loader\\bad_test\\BSA2016010101.dat  //C:\\pythonfile\\sql_loader\\bad_test\\BSA2016010101.bad  //  //UtfFile  myfile("C:\\pythonfile\\sql_loader\\bad_test\\1.dat");  //读bad文件  return 0;}

4、c++程序处理好bad数据使之可以用之后，我又写了一个脚本，和专门的ctl文件，再次把这些bad文件批量导进去：

import osbasedir=os.getcwd()#print(basedir)files_list=os.listdir(basedir)bad_dir='%s\\badd_files'%basedirlog_dir='%s\\logg_files'%basedirif not os.path.exists(bad_dir):    os.mkdir(bad_dir)if not os.path.exists(log_dir):    os.mkdir(log_dir)succ=0for file_name in files_list:    if len(file_name) != 20:        continue    if ('BS'  in file_name) and ('dat' in file_name):        #print(file_name)        table_name=file_name[0:3]+'_'+file_name[3:11]        #print(table_name)        #newname = file_name[:-4] + '.dat'        #os.rename(file_name,newname)        lines=open('bsa_bad.ctl','r').readlines()        for k in range(len(lines)):            if 'INTO TABLE' in lines[k]:                lines[k]='INTO TABLE %s\n'%table_name        f=open('bsa_bad.ctl','w')        f.writelines(lines)        f.close()        file_base=file_name[0:16]        #print(file_base)            #import        cmd_str="sqlldr xxx/xxx@xxx/orcl control=%s\\bsa_bad.ctl data=%s "%(basedir,file_name)        cmd_str+="bad=%s\\badd_files\\%s.bad log=%s\\logg_files\\%s.log direct=true"%(basedir,file_base,basedir,file_base)        #print(cmd_str)        os.system(cmd_str)        succ+=1    else:        continueprint("succ:%d"%succ)

这个文件又生成了极小的bad文件，我分析了下，都是个别在源文件就是乱码的数据，可以分门别类留存下来，不必理会。

注意：其中oracle登陆远程服务器的用户名，密码，ip地址都被我用xxx代替了
欢迎提问！

阅读全文

1 0