目次
使うデータセット
Firebaseのパブリックデータが以下にあるのでこれを使う。
firebase-public-project.analytics_153293282.events_*
0612~1003の114日分のデータ
基本集計(EDA)
イベントの数の種類別カウント
select
event_name,
count(0)
from `firebase-public-project.analytics_153293282.events_20181003`
group by 1
order by 2 desc;
スクリーンビューのスクリーン別カウント
select
(select ep.value.string_value from unnest(event_params) ep where ep.key = 'firebase_screen_class'),
count(0) screen_views
from `firebase-public-project.analytics_153293282.events_20181003`
where event_name = 'screen_view'
group by 1
order by 1;
ユーザ別イベント数カウント
行がユーザ、列がイベント名。
イベント名が動的なのでBigQuery Scriptingを使う。
declare event_pivot_columns string;
set event_pivot_columns = (
select string_agg(distinct 'sum(case when event_name = "' || event_name || '" then 1 else 0 end) ' || event_name) from `firebase-public-project.analytics_153293282.events_*`
);
execute immediate format("""
create or replace table `my-project.test.user_stats` as
select
user_pseudo_id,
min(timestamp_micros(event_timestamp)) first_open_timestamp,
max(timestamp_micros(event_timestamp)) last_open_timestamp,
count(0) total_events,
max(user_ltv.revenue) total_revenue,
%s
from `firebase-public-project.analytics_153293282.events_*`
group by 1
order by 4 desc;
""", event_pivot_columns)
対象データを抽出してフラット化
1行は1イベントで、生ログからイベントパラメータやユーザープロパティをフラット化(UNNEST()
)し、列にする。パラメータやプロパティは動的なのでBigQuery Scriptingを使う。
declare ep_columns string;
declare up_columns string;
set ep_columns = (
with t1 as (
select
p.key key,
sum(case when p.value.string_value is not null then 1 else 0 end) cnt_string,
sum(case when p.value.float_value is not null or p.value.double_value is not null then 1 else 0 end) cnt_float64,
sum(case when p.value.int_value is not null then 1 else 0 end) cnt_int64
from `firebase-public-project.analytics_153293282.events_*`, unnest(event_params) p
group by 1
), t2 as (
select
key,
case
when cnt_string > 0 then 'string'
when cnt_float64 > 0 and cnt_int64 > 0 then 'numeric'
when cnt_float64 > 0 then 'float64'
when cnt_int64 > 0 then 'int64'
else 'string'
end type
from t1
)
select
string_agg(
'(select case when p.key = "'
|| key
|| '" then case when p.value.string_value is not null then safe_cast(p.value.string_value as '
|| type
|| ') when p.value.int_value is not null then safe_cast(p.value.int_value as '
|| type
|| ') when p.value.float_value is not null then safe_cast(p.value.float_value as '
|| type
|| ') when p.value.double_value is not null then safe_cast(p.value.double_value as '
|| type
|| ') else null end else null end from unnest(event_params) p where p.key = "'
|| key
|| '") e_'
|| key
order by key
)
from t2
);
set up_columns = (
with t1 as (
select
p.key key,
sum(case when p.value.string_value is not null then 1 else 0 end) cnt_string,
sum(case when p.value.float_value is not null or p.value.double_value is not null then 1 else 0 end) cnt_float64,
sum(case when p.value.int_value is not null then 1 else 0 end) cnt_int64
from `firebase-public-project.analytics_153293282.events_*`, unnest(user_properties) p
group by 1
), t2 as(
select
key,
case
when cnt_string > 0 then 'string'
when cnt_float64 > 0 and cnt_int64 > 0 then 'numeric'
when cnt_float64 > 0 then 'float64'
when cnt_int64 > 0 then 'int64'
else 'string'
end type
from t1
)
select
string_agg(
'(select case when p.key = "'
|| key
|| '" then case when p.value.string_value is not null then safe_cast(p.value.string_value as '
|| type
|| ') when p.value.int_value is not null then safe_cast(p.value.int_value as '
|| type
|| ') when p.value.float_value is not null then safe_cast(p.value.float_value as '
|| type
|| ') when p.value.double_value is not null then safe_cast(p.value.double_value as '
|| type
|| ') else null end else null end from unnest(user_properties) p where p.key = "'
|| key
|| '") u_'
|| key
order by key
)
from t2
);
execute immediate format("""
create or replace table `my-project.test.log_filtered` partition by date(event_timestamp) as
with t1 as (
select
user_pseudo_id,
min(timestamp_micros(event_timestamp)) over(partition by user_pseudo_id) first_open_timestamp,
timestamp_micros(event_timestamp) event_timestamp,
event_name,
%s, %s,
device.category device_category,
device.operating_system,
device.operating_system_version,
case when device.browser is not null then device.browser else device.web_info.browser end browser,
case when device.browser_version is not null then device.browser_version else device.web_info.browser_version end browser_version,
platform,
app_info.install_source
from `firebase-public-project.analytics_153293282.events_*`
where user_pseudo_id in (select distinct user_pseudo_id from `firebase-public-project.analytics_153293282.events_*` where event_name = 'first_open')
), t2 as (
select *, timestamp_diff(event_timestamp, first_open_timestamp, second) seconds_since_first_open from t1
)
select * from t2
""", ep_columns, up_columns);
機械学習用データセットを作る
1か月後に継続しているか
→利用開始から29日目~35日目にスクリーンビューがあれば継続とみなして、利用開始から7日間のアクティビティに基づいて継続の有無を予測する
1行1ユーザで、利用開始から7日間のアクティビティを集計(イベント名やスクリーンクラス別カウント、エンゲージメント時間など)。特徴量化するイベントやスクリーンクラスは出現頻度1%以上に絞り込む。
-- screen_classごとのスクリーンビュー数
declare screen_class_columns string;
declare event_name_columns string;
create temp table screen_classes as
with t1 as (
select
e_firebase_screen_class, count(0) cnt
from `my-project.test.log_filtered`
where event_name = 'screen_view'
group by 1
order by 1
)
select *, cnt/sum(cnt) over() ratio from t1 order by 2 desc
;
-- 出現頻度が1%より大きいものをカウント対象にする
set screen_class_columns = (select
string_agg('sum(case when e_firebase_screen_class = "' || e_firebase_screen_class || '" and event_name = "screen_view" then 1 else 0 end) screen_views_' || e_firebase_screen_class)
from screen_classes where ratio > 0.01
);
-- event_nameごとのイベント数
create temp table event_names as
with t1 as (
select
event_name, count(0) cnt
from `my-project.test.log_filtered`
where event_name not in ('screen_view', 'user_engagement', 'first_open')
group by 1
order by 1
)
select *, cnt/sum(cnt) over() ratio from t1 order by 2 desc
;
-- 出現頻度が1%より大きいものをカウント対象にする
set event_name_columns = (select
string_agg('sum(case when event_name = "' || event_name || '" then 1 else 0 end) event_count_' || event_name)
from event_names where ratio > 0.01
);
execute immediate format("""
create or replace table `my-project.test.sum_by_user` as
with t1 as (
select
user_pseudo_id,
min(extract(dayofweek from first_open_timestamp)) first_open_dow,
min(platform) platform,
min(install_source) install_source,
sum(case when seconds_since_first_open <= 1*24*60*60 and event_name = 'screen_view' then 1 else 0 end) screen_views_first_1d,
sum(case when seconds_since_first_open <= 1*24*60*60 and e_engagement_time_msec is not null then e_engagement_time_msec else 0 end)/1000 engagement_time_seconds_first_1d,
sum(case when event_name = 'screen_view' then 1 else 0 end) screen_views,
sum(case when e_engagement_time_msec is not null then e_engagement_time_msec else 0 end)/1000 engagement_time_seconds,
%s, %s
from `my-project.test.log_filtered`
where seconds_since_first_open <= 7*24*60*60
group by 1
), t2 as (
select
user_pseudo_id,
sum(case when event_name = 'screen_view' then 1 else 0 end) screen_views_29d_35d
from `my-project.test.log_filtered`
where seconds_since_first_open > 28*24*60*60 and seconds_since_first_open <= 35*24*60*60
group by 1
), t3 as (
select
t1.*,
ifnull(screen_views_29d_35d, 0) screen_views_29d_35d,
rand() rnd
from t1 left join t2 using(user_pseudo_id)
), t4 as (
select
* except(screen_views_29d_35d, rnd),
screen_views_29d_35d > 0 active_after_4w,
case
when rnd < 0.8 then 'TRAIN'
when rnd < 0.9 then 'EVALUATE'
else 'TEST'
end subset
from t3
)
select * from t4
""", screen_class_columns, event_name_columns);
BQML
-- 学習
CREATE OR REPLACE MODEL `my-project.test.model_predict_active_users_xgb`
OPTIONS(
MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
MAX_ITERATIONS=100,
LEARN_RATE=0.1,
SUBSAMPLE=0.8,
INPUT_LABEL_COLS=['active_after_4w']
) AS
select * except(user_pseudo_id, subset)
from `my-project.test.sum_by_user`
where subset = 'TRAIN';
-- 評価
SELECT * FROM ML.EVALUATE(MODEL `my-project.test.model_predict_active_users_xgb`, (
select * except(user_pseudo_id, subset)
from `my-project.test.sum_by_user`
where subset = 'EVALUATE'
));
-- 予測
with predictions as (
SELECT * FROM ML.PREDICT(MODEL `my-project.test.model_predict_active_users_xgb`, (
select * except(subset)
from `my-project.test.sum_by_user`
where subset = 'TEST'
))
)
select
user_pseudo_id
active_after_4w,
(select p.prob from unnest(predictions.predicted_active_after_4w_probs) p where p.label = true) prob
from predictions
order by prob desc;
[公開日:2021年2月12日]
データ分析 の記事一覧