knitr::opts_chunk$set(warning = FALSE, message = FALSE)
更新
messagetime记录错误的是主营短信,目前我看了信用精灵的短信,没有记错的情况
library(RODBC)
impala <- odbcConnect("Impala")
sqlQuery(impala,"
select
sum(
cast(now() as string) < messagetime
)
,avg(
cast(now() as string) < messagetime
)
from opd.t181115_subsetxyjlSms_ljx
")
sum(cast(now() as string) < messagetime) avg(cast(now() as string) < messagetime)
1 0 0
这里的翻译都有问题 在 impala中
[.]
-> .
: char_class = F
[\d]
-> \\d
: as.character()
查看library(rebus)
library(tidyverse)
'【' %R%
one_or_more(ANY_CHAR,char_class = F) %R%
'】'%R%
one_or_more(ANY_CHAR,char_class = F) %R%
or('http'
,'t' %R%
or('/','.') %R%
'cn'
) %R%
one_or_more(or('/','.')) %R%
optional(SPC,char_class = F) %R%
'验证码' %R%
optional(or(':',':'),char_class = F) %R%
repeated(DGT,1,4,char_class = F) %>%
as.character()
## [1] "【.+】.+(?:http|t(?:/|.)cn)(?:/|.)+\\s?验证码(?::|:)?\\d{1,4}"
-- 正则化验证
select regexp_like(
'【快贷】新产品,5000元额度已入您的账户,3日有效,请尽快查收!戳 https://12i.cn/GdEnsq 验证码:1退订回T'
,'【.+】.+(?:http|t(?:/|.)cn)(?:/|.)+\\s?验证码(?::|:)?\\d{1,4}'
)
select
sum(
regexp_like(content,'http|t\\.cn')
and regexp_like(content,'退订')
and regexp_like(content,'【.+】.+(?:http|t(?:/|.)cn)(?:/|.)+\\s?验证码(?::|:)?\\d{1,4}')
)
/sum(
regexp_like(content,'http|t\\.cn')
and regexp_like(content,'退订')
)
from opd.t181204_smsAddCompany_ljx
-- 0.008758495443115662
-- 比例很少
rebus 使用举例
library(rebus)
optional('(') %R%
repeated(DGT,3) %R%
optional(')') %R%
char_class("-.() ") %R%
repeated(DGT,3) %R%
char_class("-.() ") %R%
repeated(DGT,4)
## <regex> [(]?[\d]{3}[)]?[-.() ][\d]{3}[-.() ][\d]{4}
one_or_more(ANY_CHAR) %R%
'验证码' %R%
# or(DGT,repeated(DGT,6),repeated(DGT,8))
or(repeated(DGT,1,6),repeated(DGT,8))
## <regex> [.]+验证码(?:[\d]{1,6}|[\d]{8})
SELECT
count(1)
,b.company_clean
FROM xyjl.t181115_subsetmainsms_ljx a
INNER JOIN [shuffle] xyjl.wd_7 b
ON a.content= b.content
WHERE messagetime>='2018-09-01'
AND messagetime<='2018-11-30'
group by
select*
from
(select *
from xyjl.t181115_subsetmainsms_ljx
where messagetime>='2018-05-30' and messagetime<='2018-11-30'
--and convert(char(8),messagetime,108)>='22:30:00' and convert(char(8),messagetime,108)<='23:00:00'
order by user_id) a
where content like '%注册%'
limit 1000
-- 加xyjl 库名
-- from (...) 加 alias,无论是with 还是 from 格式
group by + count
with a as (
select 1 as x1 union all
select 2 as x1 union all
select 3 as x1 union all
select 4 as x1 union all
select 5 as x1 union all
select 3 as x1
)
select count(1), x1
from a
group by 2