1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
|
//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// /file
/// This file defines classes for searching and anlyzing source code clones.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
#define LLVM_CLANG_AST_CLONEDETECTION_H
#include "clang/Basic/SourceLocation.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringMap.h"
#include <vector>
namespace clang {
class Stmt;
class Decl;
class VarDecl;
class ASTContext;
class CompoundStmt;
/// \brief Identifies a list of statements.
///
/// Can either identify a single arbitrary Stmt object, a continuous sequence of
/// child statements inside a CompoundStmt or no statements at all.
class StmtSequence {
/// If this object identifies a sequence of statements inside a CompoundStmt,
/// S points to this CompoundStmt. If this object only identifies a single
/// Stmt, then S is a pointer to this Stmt.
const Stmt *S;
/// The related ASTContext for S.
ASTContext *Context;
/// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
/// instance is representing the CompoundStmt children inside the array
/// [StartIndex, EndIndex).
unsigned StartIndex;
unsigned EndIndex;
public:
/// \brief Constructs a StmtSequence holding multiple statements.
///
/// The resulting StmtSequence identifies a continuous sequence of statements
/// in the body of the given CompoundStmt. Which statements of the body should
/// be identified needs to be specified by providing a start and end index
/// that describe a non-empty sub-array in the body of the given CompoundStmt.
///
/// \param Stmt A CompoundStmt that contains all statements in its body.
/// \param Context The ASTContext for the given CompoundStmt.
/// \param StartIndex The inclusive start index in the children array of
/// \p Stmt
/// \param EndIndex The exclusive end index in the children array of \p Stmt.
StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
unsigned StartIndex, unsigned EndIndex);
/// \brief Constructs a StmtSequence holding a single statement.
///
/// \param Stmt An arbitrary Stmt.
/// \param Context The ASTContext for the given Stmt.
StmtSequence(const Stmt *Stmt, ASTContext &Context);
/// \brief Constructs an empty StmtSequence.
StmtSequence();
typedef const Stmt *const *iterator;
/// Returns an iterator pointing to the first statement in this sequence.
iterator begin() const;
/// Returns an iterator pointing behind the last statement in this sequence.
iterator end() const;
/// Returns the first statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
const Stmt *front() const {
assert(!empty());
return begin()[0];
}
/// Returns the last statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
const Stmt *back() const {
assert(!empty());
return begin()[size() - 1];
}
/// Returns the number of statements this object holds.
unsigned size() const {
if (holdsSequence())
return EndIndex - StartIndex;
if (S == nullptr)
return 0;
return 1;
}
/// Returns true if and only if this StmtSequence contains no statements.
bool empty() const { return size() == 0; }
/// Returns the related ASTContext for the stored Stmts.
ASTContext &getASTContext() const {
assert(Context);
return *Context;
}
/// Returns true if this objects holds a list of statements.
bool holdsSequence() const { return EndIndex != 0; }
/// Returns the start sourcelocation of the first statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
SourceLocation getStartLoc() const;
/// Returns the end sourcelocation of the last statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
SourceLocation getEndLoc() const;
/// Returns the source range of the whole sequence - from the beginning
/// of the first statement to the end of the last statement.
SourceRange getSourceRange() const;
bool operator==(const StmtSequence &Other) const {
return std::tie(S, StartIndex, EndIndex) ==
std::tie(Other.S, Other.StartIndex, Other.EndIndex);
}
bool operator!=(const StmtSequence &Other) const {
return std::tie(S, StartIndex, EndIndex) !=
std::tie(Other.S, Other.StartIndex, Other.EndIndex);
}
/// Returns true if and only if this sequence covers a source range that
/// contains the source range of the given sequence \p Other.
///
/// This method should only be called on a non-empty StmtSequence object
/// and passed a non-empty StmtSequence object.
bool contains(const StmtSequence &Other) const;
};
/// \brief Searches for clones in source code.
///
/// First, this class needs a translation unit which is passed via
/// \p analyzeTranslationUnit . It will then generate and store search data
/// for all statements inside the given translation unit.
/// Afterwards the generated data can be used to find code clones by calling
/// \p findClones .
///
/// This class only searches for clones in exectuable source code
/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
/// are not supported.
class CloneDetector {
public:
typedef unsigned DataPiece;
/// Holds the data about a StmtSequence that is needed during the search for
/// code clones.
struct CloneSignature {
/// \brief The hash code of the StmtSequence.
///
/// The initial clone groups that are formed during the search for clones
/// consist only of Sequences that share the same hash code. This makes this
/// value the central part of this heuristic that is needed to find clones
/// in a performant way. For this to work, the type of this variable
/// always needs to be small and fast to compare.
///
/// Also, StmtSequences that are clones of each others have to share
/// the same hash code. StmtSequences that are not clones of each other
/// shouldn't share the same hash code, but if they do, it will only
/// degrade the performance of the hash search but doesn't influence
/// the correctness of the result.
size_t Hash;
/// \brief The complexity of the StmtSequence.
///
/// This value gives an approximation on how many direct or indirect child
/// statements are contained in the related StmtSequence. In general, the
/// greater this value, the greater the amount of statements. However, this
/// is only an approximation and the actual amount of statements can be
/// higher or lower than this value. Statements that are generated by the
/// compiler (e.g. macro expansions) for example barely influence the
/// complexity value.
///
/// The main purpose of this value is to filter clones that are too small
/// and therefore probably not interesting enough for the user.
unsigned Complexity;
/// \brief Creates an empty CloneSignature without any data.
CloneSignature() : Complexity(1) {}
CloneSignature(llvm::hash_code Hash, unsigned Complexity)
: Hash(Hash), Complexity(Complexity) {}
};
/// Holds group of StmtSequences that are clones of each other and the
/// complexity value (see CloneSignature::Complexity) that all stored
/// StmtSequences have in common.
struct CloneGroup {
std::vector<StmtSequence> Sequences;
CloneSignature Signature;
CloneGroup() {}
CloneGroup(const StmtSequence &Seq, CloneSignature Signature)
: Signature(Signature) {
Sequences.push_back(Seq);
}
/// \brief Returns false if and only if this group should be skipped when
/// searching for clones.
bool isValid() const {
// A clone group with only one member makes no sense, so we skip them.
return Sequences.size() > 1;
}
};
/// \brief Generates and stores search data for all statements in the body of
/// the given Decl.
void analyzeCodeBody(const Decl *D);
/// \brief Stores the CloneSignature to allow future querying.
void add(const StmtSequence &S, const CloneSignature &Signature);
/// \brief Searches the provided statements for clones.
///
/// \param Result Output parameter that is filled with a list of found
/// clone groups. Each group contains multiple StmtSequences
/// that were identified to be clones of each other.
/// \param MinGroupComplexity Only return clones which have at least this
/// complexity value.
/// \param CheckPatterns Returns only clone groups in which the referenced
/// variables follow the same pattern.
void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity,
bool CheckPatterns = true);
/// \brief Describes two clones that reference their variables in a different
/// pattern which could indicate a programming error.
struct SuspiciousClonePair {
/// \brief Utility class holding the relevant information about a single
/// clone in this pair.
struct SuspiciousCloneInfo {
/// The variable which referencing in this clone was against the pattern.
const VarDecl *Variable;
/// Where the variable was referenced.
const Stmt *Mention;
/// The variable that should have been referenced to follow the pattern.
/// If Suggestion is a nullptr then it's not possible to fix the pattern
/// by referencing a different variable in this clone.
const VarDecl *Suggestion;
SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
const VarDecl *Suggestion)
: Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
SuspiciousCloneInfo() {}
};
/// The first clone in the pair which always has a suggested variable.
SuspiciousCloneInfo FirstCloneInfo;
/// This other clone in the pair which can have a suggested variable.
SuspiciousCloneInfo SecondCloneInfo;
};
/// \brief Searches the provided statements for pairs of clones that don't
/// follow the same pattern when referencing variables.
/// \param Result Output parameter that will contain the clone pairs.
/// \param MinGroupComplexity Only clone pairs in which the clones have at
/// least this complexity value.
void findSuspiciousClones(std::vector<SuspiciousClonePair> &Result,
unsigned MinGroupComplexity);
private:
/// Stores all encountered StmtSequences alongside their CloneSignature.
std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
};
} // end namespace clang
#endif // LLVM_CLANG_AST_CLONEDETECTION_H
|