Hello :0

# coding: utf-8

# In[1]:

# Build the training data from MongoDB
import pymongo
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

connection = pymongo.MongoClient("mongodb://192.168.1.4")
db = connection.malware_zoo
learning_data = db.learning_data

zoo_data = learning_data.find()

process_info_list = []
label = []
r_label = []
detect_and_num = {}
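
# A minimal sketch of one learning_data document, inferred from the field
# accesses below; the field names match the code, but every value here is
# an illustrative assumption, not real zoo data:
example_doc = {
    "ie_api": ["CreateFileA", "WriteFile"],        # imported API names
    "opcode": {"mov": 120, "push": 88},            # opcode feature counts
    "section_info": {".text_entropy": 6.1},        # PE section features
    "detect": "Trojan/Win32.Agent",                # AV detection string
}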


for line in zoo_data:
    # Mark each API the sample imports as a binary feature, then merge in
    # the PE section information so one dict holds all features.
    api_list = list(set(line["ie_api"]))
    for api in api_list:
        line["opcode"][api] = 1
    line["opcode"].update(line["section_info"])
    process_info_list.append(line["opcode"])

    # Use the family name (the part before "/") of the AV detection string
    # as the class label.
    detect_val = line["detect"].split("/")[0]
    #detect_val = line['detect']

    # Assign each previously unseen family the next integer id, starting at 1.
    if detect_val not in detect_and_num:
        detect_and_num[detect_val] = len(detect_and_num) + 1

    label.append(detect_and_num[detect_val])

# Encode each integer label as a "reversed" one-hot vector of length
# max_num: label l gets its 1 at index -l (i.e. max_num - l from the front).
max_num = max(label)
for l_num in label:
    lst = [0 for _ in range(max_num)]
    lst[-l_num] = 1
    r_label.append(lst)

r_label = np.array(r_label)
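
# Quick sanity check of the encoding above: the decoding step later recovers
# the label as (max_num - argmax), so this assertion should hold for each row.
assert all(max_num - int(np.argmax(vec)) == l_num
           for vec, l_num in zip(r_label, label))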


v = DictVectorizer(sparse=False)
X = v.fit_transform(process_info_list)

# L2-normalize each sample's feature vector.
X_normalized = preprocessing.normalize(X)
print(len(X_normalized[0]))  # number of features after vectorization


# In[2]:

# Dimensionality reduction
from sklearn import decomposition

# Note: PCA requires n_components <= min(n_samples, n_features).
pca = decomposition.PCA(n_components=5000)
pca.fit(X_normalized)
x_vector = pca.transform(X_normalized)
# x_vector = X_normalized  # uncomment to skip PCA and train on raw features
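
# A small sketch (assumption: you would rather pick the component count by
# retained variance than hard-code 5000). After fitting, PCA reports how
# much of the variance the kept components explain:
print(pca.explained_variance_ratio_.sum())  # fraction of variance retained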


# In[3]:

# Integer labels
print(label)


# In[4]:

# One-hot label vectors
print(r_label)


# In[5]:

# Family-name-to-id mapping
print(detect_and_num)


# In[6]:

import tensorflow as tf  # TF1-style graph API (placeholders + Session)

x_data = x_vector
y_data = r_label

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y_data)))
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_data[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
cut = int(len(x_shuffled) * 0.90)

x_train, x_dev = x_shuffled[:cut], x_shuffled[cut:]
y_train, y_dev = y_shuffled[:cut], y_shuffled[cut:]
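
# The TODO above could be addressed with scikit-learn's utilities; a sketch
# (an alternative to the manual split, not the original code):
# from sklearn.model_selection import train_test_split
# x_train, x_dev, y_train, y_dev = train_test_split(
#     x_data, y_data, test_size=0.10, random_state=10)
# For real cross-validation, sklearn.model_selection.KFold would rotate the
# held-out 10% across folds instead of fixing a single split.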


X = tf.placeholder("float", [None, len(x_train[0])])
Y = tf.placeholder("float", [None, len(y_train[0])])

# Single-layer softmax regression: one weight per (feature, class) pair.
W = tf.Variable(tf.zeros([len(x_data[0]), len(y_data[0])]))

hypothesis = tf.nn.softmax(tf.matmul(X, W))

learning_rate = 0.4

# Cross-entropy loss, averaged over the batch.
cost = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(hypothesis), axis=1))

optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
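
# A numerically safer alternative (a sketch, not the code used above):
# computing the cross-entropy from the raw logits avoids log(0) when the
# softmax saturates.
# logits = tf.matmul(X, W)
# cost = tf.reduce_mean(
#     tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=logits))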


init = tf.global_variables_initializer()

# Highest class id; used to invert the reversed one-hot encoding.
max_dict_z = max(detect_and_num.values())
# Reverse lookup: class id -> family name.
num_to_detect = {num: name for name, num in detect_and_num.items()}
predict_list = []
true_list = []
o_x_list = []

with tf.Session() as sess:
    sess.run(init)

    for step in range(30001):
        predict_list = []
        true_list = []
        o_x_list = []

        sess.run(optimizer, feed_dict={X: x_train, Y: y_train})
        if step % 3000 == 0:

            print("cost:", sess.run(cost, feed_dict={X: x_train, Y: y_train}))
            for val in range(len(x_dev)):
                # print(x_dev[val])
                a = sess.run(hypothesis, feed_dict={X: [x_dev[val]]})
                # Undo the reversed one-hot: class id = max id - argmax index.
                predict_val = num_to_detect[max_dict_z - int(np.argmax(a, axis=1)[0])]
                true_val = num_to_detect[max_dict_z - y_dev[val].tolist().index(1)]
                predict_list.append(predict_val)
                true_list.append(true_val)
            for num in range(len(predict_list)):
                if predict_list[num] == true_list[num]:
                    o_x_list.append("O")
                else:
                    o_x_list.append("X")
            print("Step ", step)
            print("[+] Ahnlab-V3 Detection [+]")
            print(true_list)
            print("")
            print("[+] Leekyu Machine Detection [+]")
            print(predict_list)
            print("")
            print(o_x_list)
            print("")
            print(100.0 * o_x_list.count("O") / len(o_x_list), "%")

print("Test End")