统计信息准确性校验【绽放吧!GaussDB(DWS)云原生数仓】

举报
yd_249256431 发表于 2023/10/24 09:52:42 2023/10/24
【摘要】 当前OLAP数据库查询性能对客户起着至关重要的作用,而查询性能的好坏之一在于统计信息的准确性,本文针对统计信息的准确性给出校验方法,供大家参考。

   当前OLAP数据库查询性能对客户起着至关重要的作用,而查询性能的好坏之一在于统计信息的准确性,本文针对统计信息的准确性给出校验方法,供大家参考。


一:创建基表并收集统计信息

1,创建基表导入数据

drop table if exists tmptable;
create table  tmptable(
i_int int,
i_text text
);

insert into tmptable values(generate_series(1,10),'testabc');
insert into tmptable values(generate_series(1,10),'testbcd');
insert into tmptable values(generate_series(10,20),'testbcd');
insert into tmptable values(generate_series(10,20),'testcde');

2,创建表达式统计信息

create  STATISTICS s1 on  btrim(i_text,'test') from tmptable;

3,表analyze

analyze tmptable;

4,查看统计信息

     表reltuples

     select reltuples,relpages from pg_class where relname='tmptable' ;

   列统计信息

   select tablename,n_distinct,most_common_vals,most_common_freqs from pg_stats where tablename='tmptable';

  表达式统计信息

   select tablename,statistics_name,expr, n_distinct,most_common_vals, most_common_freqs   from pg_stats_ext_exprs where tablename='tmptable';

二:校验统计信息方式如下:

1,校验列的distinct

--------------------------------------------------列的distinct 估算值校验--------------------------------------------------

create or replace  function test_distinct(in tablename1 text,in attname1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$ 
declare
 a numeric;
 b numeric;
 sql_1 text;
BEGIN
sql_1:='select n_distinct from pg_stats where tablename = '''|| tablename1 || ''' and attname ='''|| attname1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||attname1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where  '||attname1||' is not null ';
EXECUTE IMMEDIATE sql_1 into  b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;

--参数:表名,列名
--返回 :真实值,估算值,准确率(越接近1表示越准确)
select test_distinct('tmptable','i_int');

2,校验mcf值估算

--------------------------------------mcf值估算校验---------------------------------- 

create or replace  function test_mcf2(in tablename1 text,in attname1 text)
returns table(col int,a_f numeric ,e_f numeric,percent numeric)
as $$ 
declare
 a bigint;
 b numeric;
 sql_1 text;
 sql_0 text;
BEGIN
 sql_0:='select count(*)  from '|| tablename1 || '';
  EXECUTE sql_0 into a;
  
sql_1:='with cte1 as
 (select regexp_split_to_table(array_to_string(most_common_vals, '',''), '','') as mcv,
         regexp_split_to_table(array_to_string(most_common_freqs, '',''), '','') as most_common_freqs,
         attname
    from pg_stats
   where tablename = '''|| tablename1 || '''
     and attname = '''|| attname1 || ''')
select '||attname1||',mcf::numeric as a_f ,most_common_freqs::numeric as  e_f ,(mcf/most_common_freqs)::numeric as rate
  from (select count(*) / '|| a || ' as mcf, '||attname1||'
          from '|| tablename1 || '
          join cte1
            on '|| tablename1 || '.'||attname1||' = cte1.mcv
         group by '||attname1||') tb1
  join cte1
    on tb1.'||attname1||' = cte1.mcv ;';
return query EXECUTE sql_1 ;

end;
$$
LANGUAGE plpgsql;

--参数:表名,列名
--返回值:mcv值,实际mcf,估算mcf,准确率(越接近1表示越准确)
select test_mcf2('tmptable','i_int');

3,表达式统计信息distinct值估算

-------------------------表达式统计信息distinct准确性计算-------------------------------

create or replace  function test_exp_distinct(in tablename1 text,in statistics_name1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$ 
declare
 a numeric;
 b numeric;
 sql_1 text;
 expr1 text;
 expr2 text;
BEGIN
expr2:='select expr  from pg_stats_ext_exprs where tablename='''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''' ;';
EXECUTE IMMEDIATE expr2 into expr1;
sql_1:='select n_distinct from pg_stats_ext_exprs where tablename = '''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||expr1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where  '||expr1||' is not null ';
EXECUTE IMMEDIATE sql_1 into  b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;

 
--参数:表名,表达式统计信息名
--出参,实际值,估算值,准确率(越接近1表示越准确)
select test_exp_distinct('tmptable','s1');

4,校验表达式统计信息mcf估算


-------------------------表达式统计信息mcf准确性计算-------------------------------
create or replace  function test_exp_mcf(in tablename1 text,statistics_name1 text)
returns table(tablename text,col text,a_f numeric ,e_f numeric,percent numeric)
as $$ 
declare
 a bigint;
 b numeric;
 sql_1 text;
 sql_0 text;
 exprtext text;
 expr1 text;
BEGIN
 expr1:='select expr  from pg_stats_ext_exprs where tablename='''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''' ;';
 EXECUTE IMMEDIATE expr1 into exprtext;
 sql_0:='select count(*)  from '|| tablename1 || '';
 EXECUTE sql_0 into a;
sql_1:='with cte1 as
 (select regexp_split_to_table(array_to_string(most_common_vals, '',''), '','') as mcv,
         regexp_split_to_table(array_to_string(most_common_freqs, '',''), '','') as most_common_freqs,
         statistics_name,expr
    from pg_stats_ext_exprs
   where tablename = '''|| tablename1 || '''
     and statistics_name = '''||statistics_name1||''')
select '''|| tablename1 || ''',mcv,a_mcf::numeric,most_common_freqs::numeric as e_mcf,most_common_freqs/a_mcf::numeric  as rate
  from (select count(*) / '|| a || ' as a_mcf, expr,'||exprtext||'as c1
          from '|| tablename1 || '
          join cte1
            on '||exprtext||'  = cte1.mcv group by mcv,expr,'||exprtext||'
         ) tb1
  join cte1 
    on c1=cte1.mcv ;
';
return query EXECUTE sql_1 ;

end;
$$
LANGUAGE plpgsql;

----入参:表名,表达式统计信息名
--出参,实际值,估算值,准确率(越接近1表示越准确)
select test_exp_mcf('tmptable','s1');

 我正在参加【有奖征文 第27期】绽放吧!GaussDB(DWS)云原生数仓!https://bbs.huaweicloud.cn/blogs/412962

【版权声明】本文为华为云社区用户原创内容,转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息, 否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。