//===------------------------- LSUnit.h -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"
namespace llvm {
namespace mca {
class Scheduler;
/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
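///
/// For example (illustrative): when a load is dispatched while an older,
/// possibly aliasing store is still in flight, the LSUnit can place the load
/// in a new MemoryGroup, make that group a successor of the store's group,
/// and record the new group identifier in the load's LSUTokenID. The
/// waiting/pending/ready state of the load is then derived from its group.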
class MemoryGroup {
unsigned NumPredecessors;
unsigned NumExecutingPredecessors;
unsigned NumExecutedPredecessors;
unsigned NumInstructions;
unsigned NumExecuting;
unsigned NumExecuted;
SmallVector<MemoryGroup *, 4> Succ;
CriticalDependency CriticalPredecessor;
InstRef CriticalMemoryInstruction;
MemoryGroup(const MemoryGroup &) = delete;
MemoryGroup &operator=(const MemoryGroup &) = delete;
public:
MemoryGroup()
: NumPredecessors(0), NumExecutingPredecessors(0),
NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
MemoryGroup(MemoryGroup &&) = default;
ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
unsigned getNumSuccessors() const { return Succ.size(); }
unsigned getNumPredecessors() const { return NumPredecessors; }
unsigned getNumExecutingPredecessors() const {
return NumExecutingPredecessors;
}
unsigned getNumExecutedPredecessors() const {
return NumExecutedPredecessors;
}
unsigned getNumInstructions() const { return NumInstructions; }
unsigned getNumExecuting() const { return NumExecuting; }
unsigned getNumExecuted() const { return NumExecuted; }
const InstRef &getCriticalMemoryInstruction() const {
return CriticalMemoryInstruction;
}
const CriticalDependency &getCriticalPredecessor() const {
return CriticalPredecessor;
}
void addSuccessor(MemoryGroup *Group) {
Group->NumPredecessors++;
assert(!isExecuted() && "Should have been removed!");
if (isExecuting())
Group->onGroupIssued(CriticalMemoryInstruction);
Succ.emplace_back(Group);
}
bool isWaiting() const {
return NumPredecessors >
(NumExecutingPredecessors + NumExecutedPredecessors);
}
bool isPending() const {
return NumExecutingPredecessors &&
((NumExecutedPredecessors + NumExecutingPredecessors) ==
NumPredecessors);
}
bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
bool isExecuting() const {
return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
}
bool isExecuted() const { return NumInstructions == NumExecuted; }
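// Informal summary of the group lifecycle, derived from the predicates above:
//   Waiting   - at least one predecessor group has not started executing.
//   Pending   - every predecessor group has at least started executing, and
//               at least one of them is still in flight.
//   Ready     - all predecessor groups have fully executed.
//   Executing - all not-yet-executed instructions of this group are in flight.
//   Executed  - every instruction of this group has finished executing.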
void onGroupIssued(const InstRef &IR) {
assert(!isReady() && "Unexpected group-start event!");
NumExecutingPredecessors++;
unsigned Cycles = IR.getInstruction()->getCyclesLeft();
if (CriticalPredecessor.Cycles < Cycles) {
CriticalPredecessor.IID = IR.getSourceIndex();
CriticalPredecessor.Cycles = Cycles;
}
}
void onGroupExecuted() {
assert(!isReady() && "Inconsistent state found!");
NumExecutingPredecessors--;
NumExecutedPredecessors++;
}
void onInstructionIssued(const InstRef &IR) {
assert(!isExecuting() && "Invalid internal state!");
++NumExecuting;
// Update the critical memory instruction.
const Instruction &IS = *IR.getInstruction();
if ((bool)CriticalMemoryInstruction) {
const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
CriticalMemoryInstruction = IR;
} else {
CriticalMemoryInstruction = IR;
}
if (!isExecuting())
return;
// Notify successors that this group started execution.
for (MemoryGroup *MG : Succ)
MG->onGroupIssued(CriticalMemoryInstruction);
}
void onInstructionExecuted() {
assert(isReady() && !isExecuted() && "Invalid internal state!");
--NumExecuting;
++NumExecuted;
if (!isExecuted())
return;
// Notify successors that this group has finished execution.
for (MemoryGroup *MG : Succ)
MG->onGroupExecuted();
}
void addInstruction() {
assert(!getNumSuccessors() && "Cannot add instructions to this group!");
++NumInstructions;
}
void cycleEvent() {
if (isWaiting() && CriticalPredecessor.Cycles)
CriticalPredecessor.Cycles--;
}
};
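// Illustrative sketch (not part of this header; names are hypothetical): how
// two groups can be wired together so that a load group waits on an older
// store group. Real groups are heap-allocated via
// LSUnitBase::createMemoryGroup().
//
//   MemoryGroup StoreGroup;
//   StoreGroup.addInstruction();          // One store in the older group.
//   MemoryGroup LoadGroup;
//   StoreGroup.addSuccessor(&LoadGroup);  // LoadGroup gains one predecessor.
//   LoadGroup.addInstruction();
//   assert(LoadGroup.isWaiting());        // The store has not issued yet.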
/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
/// Load queue size.
///
/// A value of zero for this field means that the load queue is unbounded.
/// Processor models can declare the size of a load queue via tablegen (see
/// the definition of tablegen class LoadQueue in
/// llvm/Target/TargetSchedule.td).
unsigned LQSize;
/// Store queue size.
///
/// A value of zero for this field means that the store queue is unbounded.
/// Processor models can declare the size of a store queue via tablegen (see
/// the definition of tablegen class StoreQueue in
/// llvm/Target/TargetSchedule.td).
unsigned SQSize;
unsigned UsedLQEntries;
unsigned UsedSQEntries;
/// True if loads don't alias with stores.
///
/// By default, the LS unit assumes that loads and stores don't alias with
/// each other. If this field is set to false, then loads are always assumed
/// to alias with stores.
const bool NoAlias;
/// Used to map group identifiers to MemoryGroups.
DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
unsigned NextGroupID;
public:
LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
unsigned StoreQueueSize, bool AssumeNoAlias);
virtual ~LSUnitBase();
/// Returns the total number of entries in the load queue.
unsigned getLoadQueueSize() const { return LQSize; }
/// Returns the total number of entries in the store queue.
unsigned getStoreQueueSize() const { return SQSize; }
unsigned getUsedLQEntries() const { return UsedLQEntries; }
unsigned getUsedSQEntries() const { return UsedSQEntries; }
void acquireLQSlot() { ++UsedLQEntries; }
void acquireSQSlot() { ++UsedSQEntries; }
void releaseLQSlot() { --UsedLQEntries; }
void releaseSQSlot() { --UsedSQEntries; }
bool assumeNoAlias() const { return NoAlias; }
enum Status {
LSU_AVAILABLE = 0,
LSU_LQUEUE_FULL, // Load Queue unavailable
LSU_SQUEUE_FULL // Store Queue unavailable
};
/// This method checks the availability of the load/store buffers.
///
/// Returns LSU_AVAILABLE if there are enough load/store queue entries to
/// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR is
/// not a memory operation.
virtual Status isAvailable(const InstRef &IR) const = 0;
/// Allocates LS resources for instruction IR.
///
/// This method assumes that a previous call to `isAvailable(IR)` succeeded
/// with an LSUnitBase::Status value of LSU_AVAILABLE.
/// Returns the GroupID associated with this instruction. That value will be
/// used to set the LSUTokenID field in class Instruction.
virtual unsigned dispatch(const InstRef &IR) = 0;
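// Illustrative call sequence (assumed driver code; the actual callers are the
// dispatch/execute stages of llvm-mca). setLSUTokenID() is the setter for
// field Instruction::LSUTokenID mentioned above.
//
//   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
//     unsigned TokenID = LSU.dispatch(IR);
//     IR.getInstruction()->setLSUTokenID(TokenID);
//   }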
bool isSQEmpty() const { return !UsedSQEntries; }
bool isLQEmpty() const { return !UsedLQEntries; }
bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }
bool isValidGroupID(unsigned Index) const {
return Index && (Groups.find(Index) != Groups.end());
}
/// Check if a previously dispatched instruction IR is now ready for execution.
bool isReady(const InstRef &IR) const {
unsigned GroupID = IR.getInstruction()->getLSUTokenID();
const MemoryGroup &Group = getGroup(GroupID);
return Group.isReady();
}
/// Check if instruction IR only depends on memory instructions that are
/// currently executing.
bool isPending(const InstRef &IR) const {
unsigned GroupID = IR.getInstruction()->getLSUTokenID();
const MemoryGroup &Group = getGroup(GroupID);
return Group.isPending();
}
/// Check if instruction IR is still waiting on memory operations, and the
/// wait time is still unknown.
bool isWaiting(const InstRef &IR) const {
unsigned GroupID = IR.getInstruction()->getLSUTokenID();
const MemoryGroup &Group = getGroup(GroupID);
return Group.isWaiting();
}
bool hasDependentUsers(const InstRef &IR) const {
unsigned GroupID = IR.getInstruction()->getLSUTokenID();
const MemoryGroup &Group = getGroup(GroupID);
return !Group.isExecuted() && Group.getNumSuccessors();
}
const MemoryGroup &getGroup(unsigned Index) const {
assert(isValidGroupID(Index) && "Group doesn't exist!");
return *Groups.find(Index)->second;
}
MemoryGroup &getGroup(unsigned Index) {
assert(isValidGroupID(Index) && "Group doesn't exist!");
return *Groups.find(Index)->second;
}
unsigned createMemoryGroup() {
Groups.insert(
std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
return NextGroupID++;
}
virtual void onInstructionExecuted(const InstRef &IR);
// Loads are tracked by the LDQ (load queue) from dispatch until completion.
// Stores are tracked by the STQ (store queue) from dispatch until commitment.
// By default we conservatively assume that the LDQ receives a load at
// dispatch. Loads leave the LDQ at retirement stage.
virtual void onInstructionRetired(const InstRef &IR);
virtual void onInstructionIssued(const InstRef &IR) {
unsigned GroupID = IR.getInstruction()->getLSUTokenID();
Groups[GroupID]->onInstructionIssued(IR);
}
virtual void cycleEvent();
#ifndef NDEBUG
void dump() const;
#endif
};
/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
/// stores or barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
/// not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this would only affect rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2. must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all older loads/stores have
/// already been executed.
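///
/// Example (illustrative): in the sequence { LD1, LB, LD2 }, where LB is a
/// `mayLoad` instruction with unmodeled side effects, LD2 cannot pass LB.
/// LB "executes" only once it becomes the oldest entry in the load queue,
/// i.e. once LD1 has executed.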
class LSUnit : public LSUnitBase {
// This class doesn't know about the latency of a load instruction. So, it
// conservatively/pessimistically assumes that the latency of a load opcode
// matches the instruction latency.
//
// FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
// and load/store conflicts, the latency of a load is determined by the depth
// of the load pipeline. So, we could use field `LoadLatency` in the
// MCSchedModel to model that latency.
// Field `LoadLatency` often matches the so-called 'load-to-use' latency from
// L1D, and it usually already accounts for any extra latency due to data
// forwarding.
// When doing throughput analysis, `LoadLatency` is likely to
// be a better predictor of load latency than instruction latency. This is
// particularly true when simulating code with temporal/spatial locality of
// memory accesses.
// Using `LoadLatency` (instead of the instruction latency) is also expected
// to improve the load queue allocation for long latency instructions with
// folded memory operands (See PR39829).
//
// FIXME: On some processors, load/store operations are split into multiple
// uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
// not 256-bit data types. So, a 256-bit load is effectively split into two
// 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
// simplicity, this class optimistically assumes that a load instruction only
// consumes one entry in the LoadQueue. Similarly, store instructions only
// consume a single entry in the StoreQueue.
// In the future, we should reassess the quality of this design and consider
// alternative approaches that let instructions specify the number of
// load/store queue entries which they consume at dispatch stage (See
// PR39830).
//
// An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
// conservatively treated as a store barrier. It forces older stores to be
// executed before newer stores are issued.
//
// An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
// conservatively treated as a load barrier. It forces older loads to execute
// before newer loads are issued.
unsigned CurrentLoadGroupID;
unsigned CurrentLoadBarrierGroupID;
unsigned CurrentStoreGroupID;
public:
LSUnit(const MCSchedModel &SM)
: LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
: LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
: LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
/// Returns LSU_AVAILABLE if there are enough load/store queue entries to
/// accommodate instruction IR.
Status isAvailable(const InstRef &IR) const override;
/// Allocates LS resources for instruction IR.
///
/// This method assumes that a previous call to `isAvailable(IR)` succeeded
/// returning LSU_AVAILABLE.
///
/// By default, rules are:
/// 1. A store may not pass a previous store.
/// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
/// 3. A load may pass a previous load.
/// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
/// 5. A load has to wait until an older load barrier is fully executed.
/// 6. A store has to wait until an older store barrier is fully executed.
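///
/// For example (illustrative): when dispatching the sequence
/// { ST1, LD1, ST2 }, rule 1. makes ST2 depend on ST1, rule 4. makes ST2
/// depend on LD1, and rule 2. makes LD1 depend on ST1 unless flag 'NoAlias'
/// is set.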
unsigned dispatch(const InstRef &IR) override;
void onInstructionExecuted(const InstRef &IR) override;
};
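// Illustrative sketch (assumed, simplified driver; the real drivers are the
// llvm-mca pipeline stages): how an LSUnit instance is typically exercised,
// cycle by cycle.
//
//   LSUnit LSU(SM, /*LQ=*/16, /*SQ=*/16, /*AssumeNoAlias=*/true);
//   for each simulated cycle:
//     LSU.cycleEvent();                    // Age critical dependencies.
//     for each dispatched memory operation IR:
//       if (LSU.isAvailable(IR) == LSUnit::LSU_AVAILABLE)
//         IR.getInstruction()->setLSUTokenID(LSU.dispatch(IR));
//     for each IR that starts executing:   LSU.onInstructionIssued(IR);
//     for each IR that finishes executing: LSU.onInstructionExecuted(IR);
//     for each IR that retires:            LSU.onInstructionRetired(IR);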
} // namespace mca
} // namespace llvm
#endif // LLVM_MCA_LSUNIT_H