390 likes | 529 Views
This research introduces an approach for detecting code clones through memory comparison techniques. By analyzing syntactically or semantically similar code fragments, we aim to enhance applications in software refactoring, bug detection, and understanding software evolution. We highlight the limitations of existing clone detectors, like CCFinder and DECKARD, particularly in identifying semantic clones, and propose methods involving procedural effects and intermediate variable analysis for more effective clone detection. This work makes strides in identifying malicious code duplication while improving software maintenance.
E N D
MeCC: Memory Comparison based Clone Detector Heejung Kim1, Yungbum Jung1, Sunghun Kim2, and Kwangkeun Yi1 1 Seoul National University 2 The Hong Kong University of Science and Technology By Choi Yong suk
Code Clones static PyObject * float_mul(PyObject *v, PyObject *w) { double a,b; CONVERT_TO_DOUBLE(v,a); CONVERT_TO_DOUBLE(w,b); PyFPE_START_PROTECT("multiply",return 0) a = a * b; PyFPE_END_PROTECT(a) return PyFloat_FromDouble(a); } static PyObject * float_add(PyObject *v, PyObject *w) { double a,b; CONVERT_TO_DOUBLE(v,a); CONVERT_TO_DOUBLE(w,b); PyFPE_START_PROTECT("add",return 0) a = a + b; PyFPE_END_PROTECT(a) return PyFloat_FromDouble(a); } • similar code fragments • (syntactically or semantically)
Applications of Code Clones analysis software refactoring detecting potential bugs understanding software evolution detecting software plagiarism (malicious duplication)
Clone Detectors • CCFinder [TSE’02] • textual tokens • DECKARD [ICSE’07] • AST characteristic vectors • PDG-based [ICSE’08, SAS’01] • program dependence graph - Effective for syntactic code clones - Limited for semantic code clones
Three code clones missed by syntax-based clone detection
Control Replacement (1) PyObject *PyBool_FromLong (long ok) { PyObject *result; if (ok) result = Py_True; else result = Py_False; Py_INCREF(result); return result; } static PyObject *get_pybool (int istrue) { PyObject *result = istrue? Py_True: Py_False; Py_INCREF(result); return result; } syntactically different but semantically identical
Capturing Procedural Effects (2) void appendPQExpBufferChar (PQExpBuffer str, char ch) { /* Make more room if needed */ if (!enlargePQExpBuffer(str, 1)) return; /* OK, append the data */ str->data[str->len] = ch; str->len++; str->data[str->len] = '\0'; } void appendBinaryPQExpBuffer (PQExpBuffer str, const char* data, size_t datalen) { /* Make more room if needed */ if (!enlargePQExpBuffer(str, datalen)) return; /* OK, append the data */ memcpy(str->data + str->len, data, datalen); str->len+= datalen; str->data[str->len] = '\0'; } understanding memory behavior of procedures
More Complex Clone (3) ... *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; }
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } intermediate variables ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } intermediate variables statement splitting ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } intermediate variables statement splitting ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
These Semantic Clones are Identified by MeCC
MeCC: Approach Static analyzer estimates the semantics of programs Abstract memories are results of analysis Comparing abstract memories is a measure
Clone Detection Process procedures abstract memories Static Analyzer program Comparing Memories Code Clones similarities
Estimating Semantics by Abstract Memories (guarded value ) int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } {(guard , symbolic value)} (finite mapping) Estimating an abstract memory at the procedure’s exit point Abstract memory is a map from abstract addresses to abstract values
Estimating Semantics by Abstract Memories int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } • The abstract memory state • All abstract values are guarded by execution path conditions Use symbols for unknown input values
Estimating Semantics by Abstract Memories int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } • The abstract memory state • All abstract values are guarded by execution path conditions Use symbols for unknown input values
Estimating Semantics by Abstract Memories int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } copy and modify int make2 (list2 *a, int b){ if (a==0) return b; a->n = malloc(...); a->n->v = b; return b + 2; }
Clone Detection Process procedures abstract memories Static Analyzer program Comparing Memories Code Clones similarities
Subject Projects Table 5: Time spent for the detection process.
CLONE TYPES Type-1 (Exact clones): Identical code fragments except for variations in whitespace, layout, and comments. Type-2 (Renamed clones): Syntactically identical fragments except for variations in identifiers, literals, and variable types in addition to Type-1's variations. Type-3 (Gapped clones): Copied fragments with further modifications such as changed, added, or deleted statements in addition to Type-2's variations. Type-4 (Semantic clones): Code fragments that perform similar functionality but are implemented by different syntactic variants.
Detected Clones & Semantic Clones 45% !! C. K. Roy and J. R. Cordy. A survey on software clone detection research. SCHOOL OF COMPUTING TR 2007-541, QUEEN’S UNIVERSITY, 115, 2007. Table 2: The distribution of detected clone types by MeCC.
Comparison Table 6: The numbers of detected Type-3 and Type-4 clones by MeCC, Deckard, CCFinder, and a PDG-based detector [9].
Applications of Code Clones analysis software refactoring detecting potential bugs understanding software evolution detecting software plagiarism (malicious duplication)
Finding Potential Bugs A large portion of semantic clones are due to inconsistent changes Inconsistent changes may lead to potential bugs (inconsistent clones) Two semantic clones with potential bugs
const char *GetVariable (VariableSpace space, const char *name) { struct_variable *current; if (!space) return NULL; for (current=space->next;current;current=current->next) { if (strcmp(current->name,name) == 0) { return current->value; } } return NULL; } const char *PQparameterStatus (const PGconn *conn, const char *paramName) { const pgParameterStatus *pstatus; if (!conn || !paramName) return NULL; for (pstatus=conn->pstatus; pstatus!=NULL; pstatus = pstatus->next) { if (strcmp(pstatus->name,paramName)== 0) return pstatus->value; } return NULL; } parameter name also should be checked! Missed Null Check
PyObject *pwd_getpwall (PyObject *self) { PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d; } open user database close user database A resource leak without endpwent() procedure call A Resource Leak Bug (Python project revision #20157)
PyObject *spwd_getspall (PyObject *self, PyObject *args) { PyObject *d; struct spwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setspent(); while ((p = getspent()) != NULL) { PyObject *v = mkspent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endspent(); return NULL; } Py_DECREF(v); } endspent(); return d; } PyObject *pwd_getpwall (PyObject *self) { PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d; } A Bug-free Procedure (Python project revision #38359)
PyObject *spwd_getspall (PyObject *self, PyObject *args) { PyObject *d; struct spwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setspent(); while ((p = getspent()) != NULL) { PyObject *v = mkspent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endspent(); return NULL; } Py_DECREF(v); } endspent(); return d; } PyObject *pwd_getpwall (PyObject *self) { PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endpwent(); return NULL; } Py_DECREF(v); } endpwent(); return d; } Bug fixed The Bug is Fixed Later (Python project revision #73017)
Procedure A was created with a resource leak Procedure B (a code clone of A) is introduced without resource leaks The resource leak bug in procedure A is fixed revision #20157 revision #38359 4 years the resource leak could have been fixed earlier if MeCC had been applied revision #73017
Potential Bugs and Code Smells #Semantic Clones Potential Bugs (%) Code Smells (%) detected by MeCC Table 7: Exploitable bugs and code smells in Type-3 and Type-4 clones found by MeCC.
Conclusion MeCC: Memory Comparison-based Clone Detector a new clone detector using semantics-based static analysis tolerant to syntactic variations can be used to find potential bugs