
Hello :0

Source code and content taken from:

http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

https://github.com/dennybritz/cnn-text-classification-tf

 

I heard that CNNs are powerful for text mining, so after some searching around I managed to find suitable source code. I only lightly modified the data-loading part here.

The full code is shown below.

import pymongo
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn

connection = pymongo.MongoClient("mongodb://localhost")
db = connection.malware_zoo
learning_data = db.learning_data


# Parameters
# ==================================================

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "2,3,4", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 20, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 200, "Save model after this many steps (default: 100)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================
# Load data


def get_sample_data():
    """Load samples from MongoDB and build (text, one-hot label) pairs."""
    text = []
    label = []
    r_label = []
    detect_and_num = {}
    zoo_data = learning_data.find()
    for line in zoo_data:
        # One "document" per sample: opcode mnemonics, imported API names and
        # section names, joined into a single space-separated string.
        text_val = line["opcode"].keys() + line["ie_api"] + line["section_info"].keys()
        # Keep only the family prefix of the detection name,
        # e.g. "Malware/Win32.Generic.C1597435" -> "Malware/Win32"
        detect_val = line['detect'].split(".")[0]
        #detect_val = line['detect']  # (use the full detection name instead)
        text.append(" ".join(str(e) for e in text_val))

        # Assign a new integer id to each detection name the first time it is seen
        if len(detect_and_num) == 0:
            detect_and_num[detect_val] = 1

        if detect_val not in detect_and_num.keys():
            current_max = detect_and_num[max(detect_and_num, key=detect_and_num.get)]
            detect_and_num[detect_val] = current_max + 1

        label.append(detect_and_num[detect_val])

    # Convert the integer labels to one-hot vectors
    # (label value n sets position -n, i.e. index max_num - n)
    max_num = max(label)
    for l_num in label:
        lst = [0 for _ in range(max_num)]
        lst[-l_num] = 1
        r_label.append(lst)

    r_label = np.array(r_label)
    return text, r_label, detect_and_num

print("Loading data...")
x_text, y, dict_z = get_sample_data()


# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
cut = int(len(x_shuffled) * 0.90)

x_train, x_dev = x_shuffled[:cut], x_shuffled[cut:]
y_train, y_dev = y_shuffled[:cut], y_shuffled[cut:]
print("Data Size: {:d}".format(len(x_shuffled)))
print("Label Size: {:d}".format(len(dict_z)))
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
#data set make end




# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.initialize_all_variables())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy, predictions = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

            # Compare predicted vs. true detection names.
            # One-hot index i corresponds to label value (max_num - i), so the
            # argmax index is mapped back to the detection-name id in dict_z.
            max_dict_z = dict_z[max(dict_z, key=lambda i: dict_z[i])]
            keys = list(dict_z.keys())
            values = list(dict_z.values())
            predict_list = []
            true_list = []
            o_x_list = []
            for one_predict in predictions:
                predict_list.append(keys[values.index(max_dict_z - one_predict)])

            for one_batch in y_batch:
                true_list.append(keys[values.index(max_dict_z - one_batch.tolist().index(1))])

            # Mark each sample "O" if the prediction matches the true label, "X" otherwise
            for num in range(len(predict_list)):
                if predict_list[num] == true_list[num]:
                    o_x_list.append("O")
                else:
                    o_x_list.append("X")

            print("[+] Ahnlab-V3 Detection [+]")
            print(true_list)
            print("")
            print("[+] Leekyu Machine Detection [+]")
            print(predict_list)
            print("")
            print(o_x_list)

            if writer:
                writer.add_summary(summaries, step)


        # Generate batches
        batches = data_helpers.batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))


One weakness of the opcode features is that the counts are not taken into account: each opcode is treated only as present or absent, which seems to hurt performance.
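As a rough sketch of one way to fold the counts back in (assuming line["opcode"] maps each opcode mnemonic to its occurrence count, since only its keys are used above; the helper name and the cap parameter are purely illustrative):

# Sketch only: weight opcodes by frequency instead of mere presence.
# Assumes line["opcode"] maps opcode mnemonic -> occurrence count.
def opcode_tokens_with_counts(opcode_dict, cap=20):
    tokens = []
    for op, count in opcode_dict.items():
        # Repeat each opcode up to `cap` times so frequent opcodes
        # occupy more positions in the input sequence.
        tokens.extend([str(op)] * min(int(count), cap))
    return tokens

# Example: {"mov": 3, "xor": 1} -> ["mov", "mov", "mov", "xor"]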

Running this code gives roughly 83~87% accuracy.

After shuffling the whole dataset, 90% is used as the training set and the remaining 10% is used for evaluation.

 

Using the full detection names would produce far too many output classes, so if the original detection name is Malware/Win32.Generic.C1597435, only the prefix up to Malware/Win32 is kept; with this truncation the data falls into 23 classes.
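This is exactly what the line['detect'].split(".")[0] step in get_sample_data() above does; for example:

detect = "Malware/Win32.Generic.C1597435"
family = detect.split(".")[0]
print(family)   # -> Malware/Win32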

(Screenshot 2.png: actual detection names, machine-learning predicted detection names, and the comparison results)

Using doc2vec, or adding information about opcode counts, would likely make it more accurate.
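For reference, a doc2vec baseline on the same token strings could look roughly like this (a sketch only, assuming gensim is available; x_text is the list built by get_sample_data()):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Sketch: learn a fixed-size vector per sample from the same token strings.
documents = [TaggedDocument(words=t.split(" "), tags=[i])
             for i, t in enumerate(x_text)]
d2v = Doc2Vec(documents, vector_size=100, min_count=1, epochs=20)

# d2v.dv[i] (docvecs[i] in older gensim) is the vector for sample i;
# it could be fed to a simple classifier instead of, or alongside, the CNN.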

 

============================================================

 

(Screenshot: 10.png)

"filter_sizes", "2,3,4" -> "filter_sizes", "2, ..... 3,4" 늘리면서 정확도가 상당히 향상