- Notifications
You must be signed in to change notification settings - Fork 1.4k
/
Copy pathnode.go
293 lines (262 loc) · 9.72 KB
/
node.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package lisafs
import (
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sync"
)
// numStaticChildren is the number of static children tracked by each node.
// Sampling certain filesystem heavy workloads showed that a majority of
// directories store at most 5 children in their map. This should be kept low
// to minimize the memory overhead for each node. 5 is fairly low and at the
// same time helps avoid map allocations for majority of nodes. Benchmarking
// also showed that static arrays are faster than maps for lookups until n=8.
constnumStaticChildren=5
// Node is a node on the filesystem tree. A Node is shared by all the
// ControlFDs opened on that position. For a given Server, there will only be
// one Node for a given filesystem position.
//
// Reference Model:
// - Each node holds a ref on its parent for its entire lifetime.
typeNodestruct {
// node's ref count is protected by its parent's childrenMu.
nodeRefs
// opMu synchronizes high level operations on this path.
//
// It is used to ensure the following which are important for security:
// * This node's data is protected by opMu. So all operations that change its
// data should hold opMu for writing. For example: write, setstat, setxattr,
// etc. This entails that if this node represents a directory, creation and
// deletion operations happening directly under this directory must lock
// opMu for writing. All operations accessing data must hold opMu for
// reading. This is to avoid the can of worms that open when creation and
// deletion are allowed to race. This prevents any walks from occurring
// during creation or deletion.
// * When this node is being deleted, the deletion handler must hold opMu for
// writing. This ensures that there are no concurrent operations going on
// this node while it is being deleted and potentially being replaced with
// something hazardous.
//
// A useful consequence of the above is that holding opMu for reading
// guarantees that the Server can not change Nodes on the path until this
// Node. For instance, if the grandparent needs to be renamed or deleted,
// the client must first delete this node to avoid ENOTEMPTY error. Deleting
// this node is not possible while opMu is read locked.
opMu sync.RWMutex
// deleted indicates whether the backing file has been unlinked. This can be
// used to deny operations on FDs on this Node after deletion because it is
// not safe for FD implementations to do host walks up to this position
// anymore. This node may have been replaced with something hazardous.
// deleted is protected by opMu. deleted must only be accessed/mutated using
// atomics; see markDeletedRecursive for more details.
deleted atomicbitops.Uint32
// name is the name of the file represented by this Node in parent. If this
// FD represents the root directory, then name is an empty string. name is
// protected by the backing server's rename mutex.
namestring
// parent is this parent node which tracks this node as a child. parent is
// protected by the backing server's rename mutex.
parent*Node
// controlFDs is a linked list of all the ControlFDs opened on this node.
// Prefer this over a slice to avoid additional allocations. Each ControlFD
// is an implicit linked list node so there are no additional allocations
// needed to maintain the linked list.
controlFDsMu sync.Mutex
controlFDscontrolFDList
// Here is a performance hack. Past experience has shown that map allocations
// on each node for tracking children costs a lot of memory. More small
// allocations also fragment memory. To save allocations, statically track
// upto numStaticChildren children using hardcoded pointers. If more children
// are inserted then move to a map. Use dynamicChildren iff it is non-nil.
// The following fields are protected by childrenMu.
childrenMu sync.Mutex
staticChildren [numStaticChildren]struct {
namestring
node*Node
}
dynamicChildrenmap[string]*Node
}
// DecRef implements refs.RefCounter.DecRef. Note that the context
// parameter should never be used. It exists solely to comply with the
// refs.RefCounter interface.
//
// Precondition: server's rename mutex must be at least read locked.
func (n*Node) DecRef(context.Context) {
ifn.parent==nil {
n.nodeRefs.DecRef(nil)
return
}
// If this is the only ref on node then it will need to be destroyed.
n.parent.childrenMu.Lock()
deleted:=false
n.nodeRefs.DecRef(func() {
n.parent.removeChildLocked(n.name)
deleted=true
})
n.parent.childrenMu.Unlock()
ifdeleted {
// Drop ref on parent. Keep Decref call lock free for scalability.
n.parent.DecRef(nil)
}
}
// InitLocked must be called before first use of fd.
//
// Precondition: parent.childrenMu is locked.
//
// Postconditions: A ref on n is transferred to the caller.
func (n*Node) InitLocked(namestring, parent*Node) {
n.nodeRefs.InitRefs()
n.name=name
n.parent=parent
ifparent!=nil {
parent.IncRef()
parent.insertChildLocked(name, n)
}
}
// LookupChildLocked looks up for a child with given name. Returns nil if child
// does not exist.
//
// Preconditions: childrenMu is locked.
func (n*Node) LookupChildLocked(namestring) *Node {
ifn.dynamicChildren!=nil {
returnn.dynamicChildren[name]
}
fori:=0; i<numStaticChildren; i++ {
ifn.staticChildren[i].name==name {
returnn.staticChildren[i].node
}
}
returnnil
}
// WithChildrenMu executes fn with n.childrenMu locked.
func (n*Node) WithChildrenMu(fnfunc()) {
n.childrenMu.Lock()
defern.childrenMu.Unlock()
fn()
}
// FilePath returns the absolute path of the backing file. This is an expensive
// operation. The returned path should be free of any intermediate symlinks
// because all internal (non-leaf) nodes are directories.
//
// Precondition:
// - server's rename mutex must be at least read locked. Calling handlers must
// at least have read concurrency guarantee from the server.
func (n*Node) FilePath() string {
// Walk upwards and prepend name to res.
varres fspath.Builder
forn.parent!=nil {
res.PrependComponent(n.name)
n=n.parent
}
// n is the root node.
res.PrependByte('/')
returnres.String()
}
func (n*Node) isDeleted() bool {
returnn.deleted.Load() !=0
}
func (n*Node) removeFD(fd*ControlFD) {
n.controlFDsMu.Lock()
defern.controlFDsMu.Unlock()
n.controlFDs.Remove(fd)
}
func (n*Node) insertFD(fd*ControlFD) {
n.controlFDsMu.Lock()
defern.controlFDsMu.Unlock()
n.controlFDs.PushBack(fd)
}
func (n*Node) forEachFD(fnfunc(*ControlFD)) {
n.controlFDsMu.Lock()
defern.controlFDsMu.Unlock()
forfd:=n.controlFDs.Front(); fd!=nil; fd=fd.Next() {
fn(fd)
}
}
// removeChildLocked removes child with given name from n and returns the
// removed child. Returns nil if no such child existed.
//
// Precondition: childrenMu is locked.
func (n*Node) removeChildLocked(namestring) *Node {
ifn.dynamicChildren!=nil {
toRemove:=n.dynamicChildren[name]
delete(n.dynamicChildren, name)
returntoRemove
}
fori:=0; i<numStaticChildren; i++ {
ifn.staticChildren[i].name==name {
toRemove:=n.staticChildren[i].node
n.staticChildren[i].name=""
n.staticChildren[i].node=nil
returntoRemove
}
}
returnnil
}
// insertChildLocked inserts child into n. It does not check for duplicates.
//
// Precondition: childrenMu is locked.
func (n*Node) insertChildLocked(namestring, child*Node) {
// Try to insert statically first if staticChildren is still being used.
ifn.dynamicChildren==nil {
fori:=0; i<numStaticChildren; i++ {
ifn.staticChildren[i].node==nil {
n.staticChildren[i].node=child
n.staticChildren[i].name=name
return
}
}
// Ran out of static space. Need to start inserting dynamically.
// Shift everything to the map.
n.dynamicChildren=make(map[string]*Node)
fori:=0; i<numStaticChildren; i++ {
// From above loop we know all staticChildren entries are non-nil.
n.dynamicChildren[n.staticChildren[i].name] =n.staticChildren[i].node
n.staticChildren[i].name=""
n.staticChildren[i].node=nil
}
}
n.dynamicChildren[name] =child
}
func (n*Node) forEachChild(fnfunc(*Node)) {
n.childrenMu.Lock()
defern.childrenMu.Unlock()
ifn.dynamicChildren!=nil {
for_, child:=rangen.dynamicChildren {
fn(child)
}
return
}
fori:=0; i<numStaticChildren; i++ {
ifn.staticChildren[i].node!=nil {
fn(n.staticChildren[i].node)
}
}
}
// Precondition: opMu must be locked for writing on the root node being marked
// as deleted.
func (n*Node) markDeletedRecursive() {
n.deleted.Store(1)
// No need to hold opMu for children as it introduces lock ordering issues
// because forEachChild locks childrenMu. Locking opMu after childrenMu
// violates the lock ordering. Anyway if a directory is being deleted, it
// must not have children. The client must have already deleted the entire
// subtree. If the client did not delete this subtree nodes, then the subtree
// was deleted externally and there is not much we can do. This is best
// effort work to mark the subtree as deleted.
n.forEachChild(func(child*Node) {
child.markDeletedRecursive()
})
}