Skip to content

Instantly share code, notes, and snippets.

@itsderek23
Created July 10, 2019 19:15

Revisions

  1. itsderek23 created this gist Jul 10, 2019.
    19 changes: 19 additions & 0 deletions impression_outliers_print.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,19 @@
    from sklearn.ensemble import IsolationForest

    def print_anomalies(query, column):
        """Print recent desktop rows for ``query`` whose ``column`` is an outlier.

        Filters the module-level ``df`` to rows where ``query`` matches and
        ``device`` is ``'desktop'``, fits an ``IsolationForest`` on the values
        of ``column``, and prints the rows flagged as outliers whose ``date``
        falls within 14 days of the latest date in the whole ``df``.

        Args:
            query: Value compared against ``df['query']``.
            column: Name of the numeric column to score (e.g. ``'impressions'``).

        Returns:
            None. The resulting DataFrame is printed.

        Note:
            The forest is built without a fixed ``random_state``, so the set of
            flagged rows may vary slightly between runs.
        """
        df_anom = df[(df['query'] == query) & (df['device'] == 'desktop')]
        if df_anom.empty:
            # fit() rejects an empty sample array; with no rows there is
            # nothing to flag, so report the empty frame instead of raising.
            print(df_anom)
            return

        # scikit-learn estimators expect a 2-D (n_samples, n_features) array.
        values = df_anom[column].values.reshape(-1, 1)

        isolation_forest = IsolationForest(n_estimators=100)
        isolation_forest.fit(values)

        # predict() labels inliers +1 and outliers -1.
        is_outlier = isolation_forest.predict(values) < 0
        df_outliers = df_anom[is_outlier]

        # The window is anchored on the newest date in the full dataset, not
        # just this query's rows.
        cutoff = df.date.max() - datetime.timedelta(days=14)
        df_outliers = df_outliers[df_outliers.date >= cutoff]
        print(df_outliers)

    # Report impression outliers for each of the top queries by clicks.
    for q in top_queries_by_clicks:
        print_anomalies(q, column='impressions')