orcal-day09-kattle之Spoon-oracle完成数据清洗

梁宪

2023-12-01

0.kattle由四个工具，都可以看一看

1.数据库连接：

2.数据转换：

spoon的几次调用：txt->excel,excel->csv,csv->oracle。从文本文档到格式化的文档，需要第一行做表头。
1. txt->excel注意分隔符，要和txt文本中使用的一直，是'逗号'还是'分号
2. 从csv到oracle，注意导入表前要进行建表语句。无论是在oracle还是在spoon中建表，都要注意大小和精度。
3. 在编辑输入的时候，要能读取到表结构，读出数据
4. 在编辑输出的时候，同样要指定正确的输出文件或输出表。同样要读出表结构，这是验证输入与输入是否通的关键。
5. END
获取另一个数据库中的表：首先要建立dblink，注意从其他数据库中读表的格式。
1. 建立dblink：
  -- 分配权限 grant dba to scott;
  -- 创建dblike create public database link orcl2 connect to scott identified by "123456" using '(DESCRIPTION = (ADDRESS = (PROTOCOL = TCP)(HOST = localhost)(PORT = 1521)) (CONNECT_DATA = (SERVER = DEDICATED) (SERVICE_NAME = orcl2) ) )';
2. 远程访问格式 select * from scott.dept@orcl2;
清洗的注意事项
1. 流程：定义包头，在包体写函数或存储过程，在定时器里用dbms_job.sumbit()调用函数或存储过程
2. 如果发生多行错位，可以使用CASE row1 WHEN exp1 THEN row2 END 把row2的错位数据填回row1位置去。
3. 如果要一行分多行，可以使用as将其分为多行，比如:
  substr('AB',1,1) as row_n,
  substr('AB',2,1) as row_n+1,
  在使用插入语句后，最重要的是你写的顺序要和清洗后数据存放表的顺序一致，而不是和where表的顺序一致。反而不需要关心where表顺序，想用哪个拿关键词提哪个。
4. 注意清洗空行，nvl() 或 nvl2()。出现空数据，对后期的分析是非常不好的
5. 注意对数字类变量提取数字，带有单位的字符串类型数据对数据分析是很不友好的。可以使用正则表达式，结合10个字符串函数使用。以下是五个正则函数
  1. REGEXP_LIKE
  2. REGEXP_INSTR ：返回指定字符串中与正则表达式匹配部分第一次出现的位置
  3. REGEXP_COUNT ：返回指定字符串中与正则表达式匹配部分出现的次数
  4. REGEXP_SUBSTR ：截取指定字符串中与正则表达式匹配的部分
  5. REGEXP_REPLACE ：替换指定字符串中与正则表达式匹配的部分
6. END
定时任务：定时任务的指令是 dbms_job.submit(),详细参数可以通过按住“ctrl”点击函数进入定义查看，一般而言只需要前四个参数。请通过“=>”指定函数传值，以下是四个参数的说明
1. job=> 这是jobid，是输出参数，可以在declare中定义一个变量接受此值，定义时不要赋初值。可以在表 user_jobs中查看对应id的job的调用情况，比如下一次调用时间，调用间隔。也可以通过dbms_job.run( id ) 立刻调用一次
2. what=> 此参数是字符串类型的，是调用某包某函数/过程的语句，不要丢失语句最后的分号；。
3. next_date=> 此参数指定首次执行的时间
4. interval=>次参数指定执行间隔，也是字符串类型的，里边再需要指定函数中字符串类型的输入时，使用四个单引号括起来'' ''。
5. END
书写PL/SQL语句，在进行if,case,loop等语句时，一定要先把结束结构写完整了。不然插入语句几十条，再去对哪个对哪个，就很困难了。

最后丢一段代码

--1.先读到数据
select * from scott.gzesf@orcl2;
--2.设计清洗后的表
create table gzesf as 
select '广东省' "省",
       '广州市' "市",
       area,
       street,
       community,
       introduction,
       room_type,
       acreage,
       price,
       unit_price,
       toward,
       decoration,
       elevator,
       floor,
       year,
       '指定房屋类型' "type",
       follow,
       visit,
       release_time,
       url from gzesf@orcl2 e
       where 1=2;
select * from gzesf@orcl2 old_g where old_g.room_type = '独栋别墅';
--3.实现部分，外部测试，做对应的查询
insert into gzesf 
       select
       '广东省' 省,   --设置对应的省
       '广州市' 市,   --设置对应的市
       area,        --无需修改
       street,      --无需修改
       community,   --无需修改
       case room_type when '独栋别墅' then introduction||room_type--需要将 room_type为 独栋别墅 列的错误信息 装入此列
       else   introduction end
         as  introduction,
 --      case room_type when '独栋别墅' then acreage--修改独栋别墅错列
 --        else  room_type end as room_type,
                 room_type,
        --case room_type when '独栋别墅' then to_char(trunc(price*10000/unit_price))--修改独栋别墅错列
        case room_type when '独栋别墅' then regexp_replace(toward,'[^0-9]')--修改独栋别墅错列
        else to_char(nvl(regexp_replace(acreage, '[^0-9.]',''),0)) end as acreage,--to_char
        price,
        unit_price,
        case room_type when '独栋别墅' then decoration --修改独栋别墅的错列
        else toward end as toward,
        case room_type when '独栋别墅' then '其他' --修改独栋别墅的错列
        else decoration end as decoration,
        nvl(elevator,'无电梯') as elevator,--去除空值
        regexp_replace(floor,'[^0-9]','') as floor,
        nvl(regexp_replace(year,'[^0-9]',''),
            (select median(to_number(regexp_replace(year,'[^0-9]',''))) from scott.gzesf@orcl2)) as year,--year的去空以及中位数
        replace(regexp_replace(year,'[0-9]'),'年建','') as type, --对year拆分的第二行
        regexp_replace(follow,'[^0-9]','') as follow,
        regexp_replace(visit,'[^0-9]','') as visit,
        release_time,
        url from scott.gzesf@orcl2
        --where room_type = '独栋别墅';
        where price in (350,1500);
--4.设计包和实现
CREATE OR REPLACE PACKAGE p_gzesf --包头
       is
       procedure clear_and_update_gezsf;
end p_gzesf;
-------包体，实现--------------------------------------------------------------------------
CREATE OR REPLACE PACKAGE body p_gzesf--包体，实现
       is
       procedure clear_and_update_gezsf
         is
       no_date exception; 
       v_num number;
       begin
         select count(*) into v_num  from gzesf where rownum = 1;
         if v_num > 0 then --查询，若表内有数据则先清空
           delete from gzesf;             
         end if;
         select count(*) into v_num  from gzesf where rownum = 1;
         if v_num = 0 then
           select count(*) into v_num from scott.gzesf@orcl2 where rownum = 1;
           if v_num = 0 then
             raise no_date;
           else --符合条件，开始插入数据
       insert into gzesf 
       select
       '广东省' 省,   --设置对应的省
       '广州市' 市,   --设置对应的市
       area,        --无需修改
       street,      --无需修改
       community,   --无需修改
       case room_type when '独栋别墅' then introduction||room_type--需要将 room_type为 独栋别墅 列的错误信息 装入此列
       else   introduction end as  introduction,
       case room_type when '独栋别墅' then acreage--修改独栋别墅错列
         else  room_type end as room_type,                
        case room_type when '独栋别墅' then regexp_replace(toward,'[^0-9]')--修改独栋别墅错列
        else to_char(nvl(regexp_replace(acreage, '[^0-9.]',''),0)) end as acreage,--to_char
        price,
        unit_price,
        case room_type when '独栋别墅' then decoration --修改独栋别墅的错列
        else toward end as toward,
        case room_type when '独栋别墅' then '其他' --修改独栋别墅的错列
        else decoration end as decoration,
        nvl(elevator,'无电梯') as elevator,--去除空值
        regexp_replace(floor,'[^0-9]','') as floor,
        nvl(regexp_replace(year,'[^0-9]',''),
            (select median(to_number(regexp_replace(year,'[^0-9]',''))) from scott.gzesf@orcl2)) as year,--year的去空以及中位数
        replace(regexp_replace(year,'[0-9]'),'年建','') as type, --对year拆分的第二行
        regexp_replace(follow,'[^0-9]','') as follow,
        regexp_replace(visit,'[^0-9]','') as visit,
        release_time,
        url from scott.gzesf@orcl2;
           --插入结束，请查看
           end if; 
           end if;  
                                 
         exception
           when no_date then 
              dbms_output.put_line('源表gzesf没有数据，请查看！'); 
           when others then
           dbms_output.put_line(sqlcode||'-------'||sqlerrm);
       end clear_and_update_gezsf;
begin
  null;
end p_gzesf;
--5.测试
begin
  p_gzesf.clear_and_update_gezsf;
end;

select count(*) from gzesf;

delete from gzesf;
--6.定时
declare
       v_jobID number;
begin
  dbms_job.submit ( job=>v_jobID ,
                    what=>'p_gzesf.clear_and_update_gezsf;',
                    next_date=> sysdate + 1/(24*60) ,
                    interval=>'trunc(sysdate,''mi'')+1/(24)');--这个提取分钟的mi 是由四个单引号包围的
  commit;
end;
--也可以直接调用一次job
dbms_job.run(23);

select * from user_jobs;

endHere

orcal-day09-kattle之Spoon-oracle完成数据清洗

相关阅读

相关文章

相关问答

相关文档