验证猜想:

做一个报文攻击类别分类器是一个比较棘手的问题。今天想用SVM跑一下3分类,类别分别是正常报文、XSS报文和SQL注入报文。在尝试使用rbf核函数的时候,发现正常样本被分类到了XSS中,换成linear核函数就好了,大雾。

由于后面的攻击检测引擎是SVM2分类,所以多分类之后再2分类验证的意义不太大,毕竟都是SVM也没啥好比较的,直接单引擎写一个小小的demo验证一下思路。

首先Python文件跑通了,然后C测试程序(test.c)跑通了,最后Nginx也跑通了,看上去一个小小的demo就出来了,算是迈出了基于机器学习的WAF模型探究的第一步,后续还需要研究Nginx如何输送报文给我们的引擎。

AB压力测试结果:

nginx worker进程1个的结果如下,如果worker进程多点,数据会好看点。因为这个demo是对每一个HTTP请求都检查,不知道是好还是坏,隐约记得modsecurity只检查部分HTTP请求,好像是用户的请求。

Document Path:          /
Document Length:        7093 bytes

Concurrency Level:      100
Time taken for tests:   26.555 seconds
Complete requests:      10000
Failed requests:        0
Total transferred:      72450000 bytes
HTML transferred:       70930000 bytes
Requests per second:    376.57 [#/sec] (mean)
Time per request:       265.554 [ms] (mean)
Time per request:       2.656 [ms] (mean, across all concurrent requests)
Transfer rate:          2664.31 [Kbytes/sec] received

模型结构:

├── a.out
├── config
├── data
│   ├── good-10000.txt
│   ├── good.txt
│   ├── sql-10000.txt
│   └── xss-200000.txt
├── model
│   └── waf.pkl
├── ngx_http_aisecurity_module.c
├── run.sh
├── test.c
├── waf.c
├── waf.h
├── waf.h.gch
├── waf.py
└── waf.pyc

部分代码:

// ngx_http_aisecurity_module.c by ailx10
#include "waf.h"
#include <ngx_config.h>
#include <ngx_core.h>
#include <ngx_http.h>

/* Request handler; registered in the rewrite phase by ngx_http_aisecurity_init(). */
static ngx_int_t ngx_http_aisecurity_handler(ngx_http_request_t *r);
/* Configuration hook that appends the handler to the rewrite-phase handler array. */
static ngx_int_t ngx_http_aisecurity_init(ngx_conf_t *cf);
/* Allocates the module's main configuration and loads the Python module/engine. */
static void *ngx_http_aisecurity_create_main_conf(ngx_conf_t *cf);
/* Copies an ngx_str_t into a NUL-terminated string allocated from pool p. */
char *ngx_str_to_char(ngx_str_t a, ngx_pool_t *p);
/* Builds a malloc'ed two-element array {uri, method}; release it with free_data(). */
const char** get_data(const char* uri,const char* method);
/* Frees the array returned by get_data() (not the strings it points to). */
void free_data(const char** pkt_addr);

/*
 * Main configuration of the aisecurity module. A single instance is allocated
 * and filled in by ngx_http_aisecurity_create_main_conf().
 */
typedef struct {
    PyObject    *pEngine;   /* object returned by aisec_waf_load() */
    PyObject    *pModule;   /* Python module returned by aisec_waf_init() */
    ngx_flag_t   enable;    /* set by the "aisecurity" directive */
    void        *pool;      /* cf->pool captured when the configuration was created */
} ngx_http_aisecurity_conf_t;

/* Runs the classifier on pkt_loads using the module/engine stored in cf; returns its verdict. */
ngx_int_t aisecurity_process_uri(ngx_http_aisecurity_conf_t* cf,const char** pkt_loads);
/*
 * Directive table of the module: a single "aisecurity" directive whose value
 * is stored in ngx_http_aisecurity_conf_t.enable.
 */
static ngx_command_t ngx_http_aisecurity_commands[] = {
   {
    ngx_string("aisecurity"),                       /* directive name */
    NGX_HTTP_MAIN_CONF|NGX_CONF_TAKE1,              /* http{} main context, exactly one argument */
    ngx_conf_set_flag_slot,                         /* nginx's stock flag setter */
    NGX_HTTP_MAIN_CONF_OFFSET,                      /* value lives in the module's main conf */
    offsetof(ngx_http_aisecurity_conf_t, enable),   /* field written by the setter */
    NULL,                                           /* no post handler */
   },
   ngx_null_command                                 /* table terminator */
};
 
/*
 * HTTP module context. Slot names in the comments follow the field order of
 * nginx's ngx_http_module_t; only two hooks are provided by this module.
 */
static ngx_http_module_t ngx_http_aisecurity_module_ctx = {
    NULL,                                   /* preconfiguration */
    ngx_http_aisecurity_init,               /* postconfiguration: registers the handler */
    ngx_http_aisecurity_create_main_conf,   /* create main configuration */
    NULL,                                   /* init main configuration */
    NULL,                                   /* create server configuration */
    NULL,                                   /* merge server configuration */
    NULL,                                   /* create location configuration */
    NULL                                    /* merge location configuration */
};
 
/*
 * Module definition handed to nginx. Slot names in the comments follow the
 * field order of nginx's ngx_module_t; no lifecycle callbacks are installed.
 */
ngx_module_t ngx_http_aisecurity_module = {
    NGX_MODULE_V1,
    &ngx_http_aisecurity_module_ctx,    /* module context */
    ngx_http_aisecurity_commands,       /* module directives */
    NGX_HTTP_MODULE,                    /* module type */
    NULL,                               /* init master */
    NULL,                               /* init module */
    NULL,                               /* init process */
    NULL,                               /* init thread */
    NULL,                               /* exit thread */
    NULL,                               /* exit process */
    NULL,                               /* exit master */
    NGX_MODULE_V1_PADDING
};

/*
 * Appends ngx_http_aisecurity_handler to the handler array of the HTTP
 * rewrite phase.
 *
 * Returns NGX_OK on success, NGX_ERROR when the core main configuration is
 * missing or the handler array cannot grow.
 */
static ngx_int_t ngx_http_aisecurity_init(ngx_conf_t *cf)
{
    ngx_http_core_main_conf_t *core_conf =
        ngx_http_conf_get_module_main_conf(cf, ngx_http_core_module);

    if (core_conf == NULL) {
        return NGX_ERROR;
    }

    ngx_http_handler_pt *slot =
        ngx_array_push(&core_conf->phases[NGX_HTTP_REWRITE_PHASE].handlers);

    if (slot == NULL) {
        return NGX_ERROR;
    }

    *slot = ngx_http_aisecurity_handler;

    return NGX_OK;
}

/*
 * Allocates the module's main configuration from cf->pool, initializes the
 * embedded Python interpreter through aisec_waf_init() and loads the
 * classifier through aisec_waf_load().
 *
 * Returns the new configuration, or NULL on failure. NULL (not
 * NGX_CONF_ERROR) is the failure value: nginx compares the result of a
 * create-configuration hook against NULL, so returning NGX_CONF_ERROR would
 * be stored and later dereferenced as if it were a valid configuration.
 *
 * `enable` starts as NGX_CONF_UNSET; the request handler only acts when it
 * equals 1, so an absent directive leaves the module disabled.
 */
static void *ngx_http_aisecurity_create_main_conf(ngx_conf_t *cf)
{
    ngx_http_aisecurity_conf_t *conf;
    PyObject *pModule;
    PyObject *pEngine;

    conf = ngx_pcalloc(cf->pool, sizeof(ngx_http_aisecurity_conf_t));
    if (conf == NULL) {
        return NULL;
    }

    pModule = aisec_waf_init();
    if (pModule == NULL) {
        return NULL;
    }

    /* Without an engine every later prediction would operate on NULL. */
    pEngine = aisec_waf_load(pModule);
    if (pEngine == NULL) {
        return NULL;
    }

    conf->pModule = pModule;
    conf->pEngine = pEngine;
    conf->enable = NGX_CONF_UNSET;
    conf->pool = cf->pool;
    printf("(nginx)%p\t%p\n", (void *) pModule, (void *) pEngine);
    return conf;
}

/*
 * Rewrite-phase handler: passes the request's unparsed URI and method name to
 * the classifier and rejects the request when it is flagged.
 *
 * Returns
 *   NGX_DECLINED                   - module disabled, or request not flagged;
 *   NGX_HTTP_FORBIDDEN             - classifier returned a non-zero verdict;
 *   NGX_HTTP_INTERNAL_SERVER_ERROR - a copy or the field array could not be
 *                                    allocated.
 */
static ngx_int_t ngx_http_aisecurity_handler(ngx_http_request_t *r)
{
    ngx_http_aisecurity_conf_t *cf;
    const char *uri;
    const char *method;
    const char **pkt_addr;
    ngx_int_t atk;

    cf = ngx_http_get_module_main_conf(r, ngx_http_aisecurity_module);
    if (cf->enable != 1) {
        return NGX_DECLINED;
    }

    uri = ngx_str_to_char(r->unparsed_uri, r->pool);
    method = ngx_str_to_char(r->method_name, r->pool);

    /* ngx_str_to_char() reports allocation failure with the (char *)-1 sentinel. */
    if (uri == (char *) -1 || method == (char *) -1) {
        return NGX_HTTP_INTERNAL_SERVER_ERROR;
    }

    /*
     * ngx_str_to_char() returns NULL for an empty ngx_str_t. Substitute an
     * empty string so that neither printf("%s") nor the classifier ever
     * receives a NULL pointer.
     */
    if (uri == NULL) {
        uri = "";
    }
    if (method == NULL) {
        method = "";
    }

    pkt_addr = get_data(uri, method);
    if (pkt_addr == NULL) {
        return NGX_HTTP_INTERNAL_SERVER_ERROR;
    }

    atk = aisecurity_process_uri(cf, pkt_addr);
    free_data(pkt_addr);

    /* Any non-zero verdict (including -1) blocks the request. */
    return atk ? NGX_HTTP_FORBIDDEN : NGX_DECLINED;
}

/*
 * Packs the two request fields into a heap-allocated array {uri, method} and
 * prints them to stdout.
 *
 * The array only borrows the two pointers; the caller releases the array
 * itself with free_data().
 *
 * Returns the array, or NULL when it cannot be allocated.
 */
const char** get_data(const char* uri,const char* method)
{
    const char **pkt_addr = malloc(2 * sizeof *pkt_addr);

    if (pkt_addr == NULL) {
        return NULL;
    }

    pkt_addr[0] = uri;
    pkt_addr[1] = method;
    printf("pkt:\n%s\n%s\n", pkt_addr[0], pkt_addr[1]);
    return pkt_addr;
}

/* Releases the array itself; the strings its elements point to are left untouched. */
void free_data(const char** pkt_addr)
{
    free((void *) pkt_addr);
}

/*
 * Forwards pkt_loads to aisec_waf_predict() together with the Python module
 * and engine stored in cf, and returns that function's result.
 */
ngx_int_t aisecurity_process_uri(ngx_http_aisecurity_conf_t* cf,const char** pkt_loads)
{
    return aisec_waf_predict(cf->pModule, cf->pEngine, pkt_loads);
}

/*
 * Copies the bytes of `a` into a NUL-terminated buffer allocated from pool `p`.
 *
 * Returns the copy, NULL when `a` is empty, or the sentinel (char *)-1 when
 * the pool allocation fails.
 */
ngx_inline char *ngx_str_to_char(ngx_str_t a, ngx_pool_t *p)
{
    char *copy;

    if (a.len == 0) {
        return NULL;
    }

    copy = ngx_pnalloc(p, a.len + 1);
    if (copy == NULL) {
        return (char *)-1;
    }

    ngx_memcpy(copy, a.data, a.len);
    copy[a.len] = '\0';

    return copy;
}

// waf.c by ailx10
#include "waf.h"

PyObject* aisec_waf_init()
{
    Py_Initialize();
    PyRun_SimpleString("import sys");
    PyRun_SimpleString("sys.path.append('/home/nginx/aisecurity-0.2')");
    PyObject* moduleName = PyString_FromString("waf");
    PyObject* pModule = PyImport_Import(moduleName);
    if (!pModule)
    {
        printf("[C++] Python get module failed.\n");
        exit(0);
    }
    printf("[C++]waf load pModule ok :%p\n",pModule);
    return pModule;
}

/*
 * Calls the Python function waf_load() of pModule and returns its result
 * (the classifier object) as a new reference.
 *
 * Terminates the process when pModule has no callable "waf_load". Returns
 * NULL, after printing the Python error, when the call itself fails.
 */
PyObject* aisec_waf_load(PyObject* pModule)
{
    PyObject *waf_load;
    PyObject *waf_engine;

    waf_load = PyObject_GetAttrString(pModule,"waf_load");
    if (!waf_load || !PyCallable_Check(waf_load))
    {
        printf("[C++] Can't find funftion (waf_load).\n");
        exit(0);
    }

    waf_engine = PyObject_CallObject(waf_load,NULL);
    /* PyObject_GetAttrString returned a new reference; drop it after the call. */
    Py_DECREF(waf_load);

    if (waf_engine == NULL)
    {
        /* Do not report success for a failed load; show the Python traceback. */
        PyErr_Print();
        printf("[C++] Can't run function (waf_load).\n");
        return NULL;
    }

    printf("[C++]waf load pEngine ok :%p\n", (void *) waf_engine);
    return waf_engine;
}

/*
 * Calls the Python function waf_predict(engine, [field0, field1]) of pModule
 * and returns the integer found in the 1-tuple it yields.
 *
 * pkt_loads must point to at least two C strings (fields_num == 2).
 *
 * Returns the parsed verdict, or -1 when the arguments could not be built or
 * the result could not be parsed. As before, the process terminates when
 * "waf_predict" is missing or the call raises.
 *
 * Reference handling: every object created here is released before
 * returning, so repeated calls do not grow the heap. pEngine is borrowed
 * from the caller; because PyTuple_SetItem() steals a reference, the engine
 * is INCREF'ed before being stored so that releasing the tuple leaves the
 * caller's reference intact.
 */
int aisec_waf_predict(PyObject* pModule,PyObject* pEngine,const char** pkt_loads)
{
    int i;
    int isAtk = -1;
    int fields_num = 2;
    PyObject *waf_predict;
    PyObject *args;
    PyObject *fields;
    PyObject *waf_predict_info;

    printf("[C++]waf predicting ....\n");
    printf("[C++]pModule=%p\tpEngine=%p\n", (void *) pModule, (void *) pEngine);

    waf_predict = PyObject_GetAttrString(pModule, "waf_predict");
    if (!waf_predict || !PyCallable_Check(waf_predict))
    {
        printf("[C++] Can't find funftion (waf_predict).\n");
        exit(0);
    }

    args = PyTuple_New(2);
    fields = PyList_New(fields_num);
    if (args == NULL || fields == NULL)
    {
        PyErr_Print();
        Py_XDECREF(fields);
        goto cleanup;
    }

    for (i = 0; i < fields_num; i++)
    {
        PyObject *field = Py_BuildValue("s", pkt_loads[i]);
        if (field == NULL)
        {
            PyErr_Print();
            Py_DECREF(fields);
            goto cleanup;
        }
        PyList_SetItem(fields, i, field);   /* steals the reference to field */
        printf("pkt_load = %s\n",pkt_loads[i]);
    }

    Py_INCREF(pEngine);                     /* balance the reference stolen below */
    PyTuple_SetItem(args, 0, pEngine);
    PyTuple_SetItem(args, 1, fields);       /* the tuple now owns the list */

    waf_predict_info = PyObject_CallObject(waf_predict,args);
    if (waf_predict_info == NULL)
    {
        PyErr_Print();
        printf("[C++] Can't run funftion (waf_predict).\n");
        exit(0);
    }

    if (!PyArg_ParseTuple(waf_predict_info,"i",&isAtk))
    {
        /* Unexpected result shape: clear the error and report "unknown". */
        PyErr_Print();
        isAtk = -1;
    }
    Py_DECREF(waf_predict_info);
    printf("isAtk:   %d\n", isAtk);

cleanup:
    Py_XDECREF(args);
    Py_DECREF(waf_predict);
    return isAtk;
}

# waf.py by ailx10
import re
import urllib
from sklearn import svm
from sklearn.externals import  joblib

from sklearn import metrics
from sklearn.model_selection import train_test_split


def get_len(url):
    """Return the length of ``url`` as reported by ``len``."""
    size = len(url)
    return size

def get_url_count(url):
    """Return 1 when ``url`` contains ``http://`` or ``https://`` (case-insensitive), else 0."""
    found = re.search('(http://)|(https://)', url, re.IGNORECASE)
    return 1 if found else 0

def get_xss_evil_char(url):
    """Count the characters of ``url`` that are one of ``< > , ' " /``."""
    hits = re.findall("[<>,\'\"/]", url, re.IGNORECASE)
    return len(hits)

def get_xss_evil_word(url):
    """Count case-insensitive occurrences of XSS-related tokens in ``url``.

    Tokens: alert, script=, %3c, %3e, %20, onerror, onload, eval, src=, prompt.

    The previous pattern lacked the ``|`` between ``(script=)`` and ``(%3c)``,
    so those two tokens only matched as the single sequence ``script=%3c`` and
    never on their own. Each token is now an independent alternative.

    NOTE: this changes the feature values, so a model pickled with the old
    pattern should be retrained before it is used with this function.
    """
    pattern = "(alert)|(script=)|(%3c)|(%3e)|(%20)|(onerror)|(onload)|(eval)|(src=)|(prompt)"
    return len(re.findall(pattern, url, re.IGNORECASE))

def get_sql_evil_char(url):
    """Count the characters of ``url`` that are one of ``- , ' " * /``."""
    return sum(1 for _ in re.finditer("[-,\'\"*/]", url, re.IGNORECASE))

def get_sql_evil_word(url):
    """Count case-insensitive occurrences of SQL-related tokens in ``url``.

    Tokens: SELECT, CASE, WHEN, ORDER, GROUP, count, %2C%20, char, NULL, AND.
    """
    matches = re.findall(
        "(SELECT)|(CASE)|(WHEN)|(ORDER)|(GROUP)|(count)|(%2C%20)|(char)|(NULL)|(AND)",
        url,
        re.IGNORECASE,
    )
    return len(matches)

def get_feature(filename,x,y,atk_index):
    """Append one feature row and one label per line of ``filename``.

    Each line has its trailing newline characters stripped and is URL-unquoted;
    the six feature functions are then applied in a fixed order and the
    resulting list is appended to ``x``, while ``atk_index`` is appended to
    ``y``. Both lists are modified in place; nothing is returned.
    """
    extractors = (
        get_len,
        get_url_count,
        get_xss_evil_char,
        get_xss_evil_word,
        get_sql_evil_char,
        get_sql_evil_word,
    )
    with open(filename) as handle:
        for raw in handle:
            sample = urllib.unquote(raw.strip('\n'))
            x.append([extract(sample) for extract in extractors])
            y.append(atk_index)


def do_metrics(y_test,y_pred):
    """Print accuracy, confusion matrix and per-class recall, then ``saved!``.

    Every ``print`` takes a single parenthesized argument, which produces the
    same output as the statement form under Python 2.
    """
    print("metrics.accuracy_score:")
    print(metrics.accuracy_score(y_test, y_pred))
    print("metrics.confusion_matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))
    print("metrics.recall_score:")
    print(metrics.recall_score(y_test, y_pred,average=None))
    print("saved!")

def train():
    """Train a linear-kernel SVC on the three data files, save it and print metrics.

    Labels: 2 for ``sql-10000.txt``, 1 for ``xss-200000.txt``, 0 for
    ``good-10000.txt``. 20% of the samples are held out for evaluation and the
    fitted model is written to ``./model/waf.pkl``.
    """
    features, labels = [], []
    # The files are read in this order; it determines the sample order fed to the split.
    datasets = (
        ('./data/sql-10000.txt', 2),
        ('./data/xss-200000.txt', 1),
        ('./data/good-10000.txt', 0),
    )
    for path, label in datasets:
        get_feature(path, features, labels, label)

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0)

    clf = svm.SVC(kernel='linear')
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    joblib.dump(clf,"./model/waf.pkl")
    do_metrics(y_test, y_pred)


def get_pkt_feature(pkts):
    """Return one six-element feature row per string in ``pkts``.

    Each string is URL-unquoted first; the features are, in order: length,
    URL-scheme flag, XSS character count, XSS word count, SQL character count
    and SQL word count.
    """
    extractors = (
        get_len,
        get_url_count,
        get_xss_evil_char,
        get_xss_evil_word,
        get_sql_evil_char,
        get_sql_evil_word,
    )
    rows = []
    for pkt in pkts:
        decoded = urllib.unquote(pkt)
        rows.append([extract(decoded) for extract in extractors])
    return rows


def waf_load():
    """Load and return the classifier stored at the fixed absolute model path."""
    model_path = "/home/nginx/aisecurity-0.2/model/waf.pkl"
    return joblib.load(model_path)

def waf_predict(clf,pkts):
    """Classify every string in ``pkts`` and report whether any is an attack.

    Returns the 1-tuple ``(1,)`` when at least one predicted label is greater
    than 0, otherwise ``(0,)``.
    """
    predictions = clf.predict(get_pkt_feature(pkts))
    flagged = any(label > 0 for label in predictions)
    return (1 if flagged else 0,)

def test(clf):
    """Print ``clf``'s predictions for three hard-coded sample strings."""
    samples = [
        "i love web security!",
        "<script>alert(1)</script>",
        "-2' union select group_concat(Username),2,3 from Person",
    ]
    print(clf.predict(get_pkt_feature(samples)))

if __name__ == "__main__":
    # Train and save the model, then load a model from disk and print sample predictions.
    # NOTE(review): train() writes ./model/waf.pkl while waf_load() reads
    # /home/nginx/aisecurity-0.2/model/waf.pkl; these are the same file only
    # when the script is run from /home/nginx/aisecurity-0.2.
    train()
    test(waf_load())

Logo

DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。

更多推荐