Source File: malloc_stubs.go
Belonging Package: runtime
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains stub functions that are not meant to be called directly,
// but that will be assembled together using the inlining logic in runtime/_mkmalloc
// to produce a full mallocgc function that's specialized for a span class
// or specific size in the case of the tiny allocator.
//
// To generate the specialized mallocgc functions, do 'go run .' inside runtime/_mkmalloc.
//
// To assemble a mallocgc function, the mallocStub function is cloned, and the call to
// inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub,
// smallNoScanStub or tinyStub, depending on the parameters being specialized.
//
// The size_ (for the tiny case) and elemsize_, sizeclass_, and noscanint_ (for all three cases)
// identifiers are replaced with the value of the parameter in the specialized case.
// The nextFreeFastStub, nextFreeFastTiny, heapSetTypeNoHeaderStub, and writeHeapBitsSmallStub
// functions are also inlined by _mkmalloc.

package runtime

import (
	"internal/goarch"
	"internal/goexperiment"
	"internal/runtime/sys"
	"unsafe"
)

// These identifiers will all be replaced by the inliner. So their values don't
// really matter: they just need to be set so that the stub functions, which
// will never be used on their own, can compile. elemsize_ can't be set to
// zero because we divide by it in nextFreeFastTiny, and the compiler would
// complain about a division by zero. Its replaced value will always be greater
// than zero.
const elemsize_ = 8
const sizeclass_ = 0
const noscanint_ = 0
const size_ = 0
const isTiny_ = false

func malloc0(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	if doubleCheckMalloc {
		if gcphase == _GCmarktermination {
			throw("mallocgc called with gcphase == _GCmarktermination")
		}
	}
	// Short-circuit zero-sized allocation requests.
	return unsafe.Pointer(&zerobase)
}

func mallocPanic(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	panic("not defined for sizeclass")
}

// WARNING: mallocStub does not do any work for sanitizers so callers need
// to steer out of this codepath early if sanitizers are enabled.
func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	if isTiny_ {
		// secret code, need to avoid the tiny allocator since it might keep
		// co-located values alive longer and prevent timely zero-ing
		//
		// Call directly into the NoScan allocator.
		// See go.dev/issue/76356
		gp := getg()
		if goexperiment.RuntimeSecret && gp.secret > 0 {
			return mallocgcSmallNoScanSC2(size, typ, needzero)
		}
	}
	if doubleCheckMalloc {
		if gcphase == _GCmarktermination {
			throw("mallocgc called with gcphase == _GCmarktermination")
		}
	}

	// It's possible for any malloc to trigger sweeping, which may in
	// turn queue finalizers. Record this dynamic lock edge.
	// N.B. Compiled away if lockrank experiment is not enabled.
	lockRankMayQueueFinalizer()

	// Pre-malloc debug hooks.
	if debug.malloc {
		if x := preMallocgcDebug(size, typ); x != nil {
			return x
		}
	}
	// Assist the GC if needed. (On the reuse path, we currently compensate for this;
	// changes here might require changes there.)
	if gcBlackenEnabled != 0 {
		deductAssistCredit(size)
	}

	// Actually do the allocation.
	x, elemsize := inlinedMalloc(size, typ, needzero)

	if !isTiny_ {
		gp := getg()
		if goexperiment.RuntimeSecret && gp.secret > 0 {
			// Mark any object allocated while in secret mode as secret.
			// This ensures we zero it immediately when freeing it.
			addSecret(x)
		}
	}

	// Notify valgrind, if enabled.
	// To allow the compiler to not know about valgrind, we do valgrind instrumentation
	// unlike the other sanitizers.
	if valgrindenabled {
		valgrindMalloc(x, size)
	}

	// Adjust our GC assist debt to account for internal fragmentation.
	if gcBlackenEnabled != 0 && elemsize != 0 {
		if assistG := getg().m.curg; assistG != nil {
			assistG.gcAssistBytes -= int64(elemsize - size)
		}
	}

	// Post-malloc debug hooks.
	if debug.malloc {
		postMallocgcDebug(x, elemsize, typ)
	}
	return x
}

// inlinedMalloc will never be called. It is defined just so that the compiler can compile
// the mallocStub function, which will also never be called, but instead used as a template
// to generate a size-specialized malloc function. The call to inlinedMalloc in mallocStub
// will be replaced with the inlined body of smallScanNoHeaderStub, smallNoScanStub, or tinyStub
// when generating the size-specialized malloc function. See the comment at the top of this
// file for more information.
func inlinedMalloc(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	return unsafe.Pointer(uintptr(0)), 0
}

func doubleCheckSmallScanNoHeader(size uintptr, typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ == nil || !typ.Pointers() {
		throw("noscan allocated in scan-only path")
	}
	if !heapBitsInSpan(size) {
		throw("heap bits in not in span for non-header-only path")
	}
}

func smallScanNoHeaderStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	const sizeclass = sizeclass_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckSmallScanNoHeader(size, typ, mp)
	}
	mp.mallocing = 1

	checkGCTrigger := false
	c := getMCache(mp)
	const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
	span := c.alloc[spc]
	v := nextFreeFastStub(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(spc)
	}
	x := unsafe.Pointer(v)
	if span.needzero != 0 {
		memclrNoHeapPointers(x, size)
	}
	if goarch.PtrSize == 8 && sizeclass == 1 {
		// initHeapBits already set the pointer bits for the 8-byte sizeclass
		// on 64-bit platforms.
		c.scanAlloc += 8
	} else {
		dataSize := size // make the inliner happy
		x := uintptr(x)
		scanSize := heapSetTypeNoHeaderStub(x, dataSize, typ, span)
		c.scanAlloc += scanSize
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}
	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}
	return x, elemsize
}

func doubleCheckSmallNoScan(typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ != nil && typ.Pointers() {
		throw("expected noscan type for noscan alloc")
	}
}

func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	// TODO(matloob): Add functionality to mkmalloc to allow us to inline a non-constant
	// sizeclass_ and elemsize_ value (instead just set to the expressions to look up the size class
	// and elemsize. We'd also need to teach mkmalloc that values that are touched by these (specifically
	// spc below) should turn into vars. This would allow us to generate mallocgcSmallNoScan itself,
	// so that its code could not diverge from the generated functions.
	const sizeclass = sizeclass_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckSmallNoScan(typ, mp)
	}
	mp.mallocing = 1

	checkGCTrigger := false
	c := getMCache(mp)
	const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
	span := c.alloc[spc]

	// First, check for a reusable object.
	if runtimeFreegcEnabled && c.hasReusableNoscan(spc) {
		// We have a reusable object, use it.
		x := mallocgcSmallNoscanReuse(size, elemsize, needzero, c, spc)
		mp.mallocing = 0
		releasem(mp)
		// TODO(thepudds): note that the generated return path is essentially duplicated
		// by the generator. For example, see the two postMallocgcDebug calls and
		// related duplicated code on the return path currently in the generated
		// mallocgcSmallNoScanSC2 function. One set of those correspond to this
		// return here. We might be able to de-duplicate the generated return path
		// by updating the generator, perhaps by jumping to a shared return or similar.
		return x, elemsize
	}

	v := nextFreeFastStub(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(spc)
	}
	x := unsafe.Pointer(v)
	if needzero && span.needzero != 0 {
		memclrNoHeapPointers(x, size)
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}
	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}
	return x, elemsize
}

func doubleCheckTiny(size uintptr, typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ != nil && typ.Pointers() {
		throw("expected noscan for tiny alloc")
	}
}

func tinyStub(_ uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	const size = size_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckTiny(size, typ, mp)
	}
	mp.mallocing = 1

	// Tiny allocator.
	//
	// Tiny allocator combines several tiny allocation requests
	// into a single memory block. The resulting memory block
	// is freed when all subobjects are unreachable. The subobjects
	// must be noscan (don't have pointers), this ensures that
	// the amount of potentially wasted memory is bounded.
	//
	// Size of the memory block used for combining (maxTinySize) is tunable.
	// Current setting is 16 bytes, which relates to 2x worst case memory
	// wastage (when all but one subobjects are unreachable).
	// 8 bytes would result in no wastage at all, but provides less
	// opportunities for combining.
	// 32 bytes provides more opportunities for combining,
	// but can lead to 4x worst case wastage.
	// The best case winning is 8x regardless of block size.
	//
	// Objects obtained from tiny allocator must not be freed explicitly.
	// So when an object will be freed explicitly, we ensure that
	// its size >= maxTinySize.
	//
	// SetFinalizer has a special case for objects potentially coming
	// from tiny allocator, in such case it allows to set finalizers
	// for an inner byte of a memory block.
	//
	// The main targets of tiny allocator are small strings and
	// standalone escaping variables. On a json benchmark
	// the allocator reduces number of allocations by ~12% and
	// reduces heap size by ~20%.
	c := getMCache(mp)
	off := c.tinyoffset
	// Align tiny pointer for required (conservative) alignment.
	if size&7 == 0 {
		off = alignUp(off, 8)
	} else if goarch.PtrSize == 4 && size == 12 {
		// Conservatively align 12-byte objects to 8 bytes on 32-bit
		// systems so that objects whose first field is a 64-bit
		// value is aligned to 8 bytes and does not cause a fault on
		// atomic access. See issue 37262.
		// TODO(mknyszek): Remove this workaround if/when issue 36606
		// is resolved.
		off = alignUp(off, 8)
	} else if size&3 == 0 {
		off = alignUp(off, 4)
	} else if size&1 == 0 {
		off = alignUp(off, 2)
	}
	if off+size <= maxTinySize && c.tiny != 0 {
		// The object fits into existing tiny block.
		x := unsafe.Pointer(c.tiny + off)
		c.tinyoffset = off + size
		c.tinyAllocs++
		mp.mallocing = 0
		releasem(mp)
		return x, 0
	}
	// Allocate a new maxTinySize block.
	checkGCTrigger := false
	span := c.alloc[tinySpanClass]
	v := nextFreeFastTiny(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(tinySpanClass)
	}
	x := unsafe.Pointer(v)
	(*[2]uint64)(x)[0] = 0 // Always zero
	(*[2]uint64)(x)[1] = 0
	// See if we need to replace the existing tiny block with the new one
	// based on amount of remaining free space.
	if !raceenabled && (size < c.tinyoffset || c.tiny == 0) {
		// Note: disabled when race detector is on, see comment near end of this function.
		c.tiny = uintptr(x)
		c.tinyoffset = size
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}

	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}
	if raceenabled {
		// Pad tinysize allocations so they are aligned with the end
		// of the tinyalloc region. This ensures that any arithmetic
		// that goes off the top end of the object will be detectable
		// by checkptr (issue 38872).
		// Note that we disable tinyalloc when raceenabled for this to work.
		// TODO: This padding is only performed when the race detector
		// is enabled. It would be nice to enable it if any package
		// was compiled with checkptr, but there's no easy way to
		// detect that (especially at compile time).
		// TODO: enable this padding for all allocations, not just
		// tinyalloc ones. It's tricky because of pointer maps.
		// Maybe just all noscan objects?
		x = add(x, elemsize-size)
	}
	return x, elemsize
}

// TODO(matloob): Should we let the go compiler inline this instead of using mkmalloc?
// We won't be able to use elemsize_ but that's probably ok.
func nextFreeFastTiny(span *mspan) gclinkptr {
	const spanSize = 8192
	const nelems = uint16((spanSize - unsafe.Sizeof(spanInlineMarkBits{})) / elemsize_)
	var v gclinkptr
	if span.allocCache != 0 {
		theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
		result := span.freeindex + uint16(theBit)
		if result < nelems {
			freeidx := result + 1
			if !(freeidx%64 == 0 && freeidx != nelems) {
				span.allocCache >>= uint(theBit + 1)
				span.freeindex = freeidx
				span.allocCount++
				v = gclinkptr(uintptr(result)*elemsize_ + span.base())
			}
		}
	}
	return v
}

func nextFreeFastStub(span *mspan) gclinkptr {
	var v gclinkptr
	if span.allocCache != 0 {
		theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
		result := span.freeindex + uint16(theBit)
		if result < span.nelems {
			freeidx := result + 1
			if !(freeidx%64 == 0 && freeidx != span.nelems) {
				span.allocCache >>= uint(theBit + 1)
				span.freeindex = freeidx
				span.allocCount++
				v = gclinkptr(uintptr(result)*elemsize_ + span.base())
			}
		}
	}
	return v
}

func heapSetTypeNoHeaderStub(x, dataSize uintptr, typ *_type, span *mspan) uintptr {
	if doubleCheckHeapSetType && (!heapBitsInSpan(dataSize) || !heapBitsInSpan(elemsize_)) {
		throw("tried to write heap bits, but no heap bits in span")
	}
	scanSize := writeHeapBitsSmallStub(span, x, dataSize, typ)
	if doubleCheckHeapSetType {
		doubleCheckHeapType(x, dataSize, typ, nil, span)
	}
	return scanSize
}

// writeHeapBitsSmallStub writes the heap bits for small objects whose ptr/scalar data is
// stored as a bitmap at the end of the span.
//
// Assumes dataSize is <= ptrBits*goarch.PtrSize. x must be a pointer into the span.
// heapBitsInSpan(dataSize) must be true. dataSize must be >= typ.Size_.
//
//go:nosplit
func writeHeapBitsSmallStub(span *mspan, x, dataSize uintptr, typ *_type) uintptr {
	// The objects here are always really small, so a single load is sufficient.
	src0 := readUintptr(getGCMask(typ))

	const elemsize = elemsize_

	// Create repetitions of the bitmap if we have a small slice backing store.
	scanSize := typ.PtrBytes
	src := src0
	if typ.Size_ == goarch.PtrSize {
		src = (1 << (dataSize / goarch.PtrSize)) - 1
	} else {
		// N.B. We rely on dataSize being an exact multiple of the type size.
		// The alternative is to be defensive and mask out src to the length
		// of dataSize. The purpose is to save on one additional masking operation.
		if doubleCheckHeapSetType && !asanenabled && dataSize%typ.Size_ != 0 {
			throw("runtime: (*mspan).writeHeapBitsSmall: dataSize is not a multiple of typ.Size_")
		}
		for i := typ.Size_; i < dataSize; i += typ.Size_ {
			src |= src0 << (i / goarch.PtrSize)
			scanSize += typ.Size_
		}
	}

	// Since we're never writing more than one uintptr's worth of bits, we're either going
	// to do one or two writes.
	dstBase, _ := spanHeapBitsRange(span.base(), pageSize, elemsize)
	dst := unsafe.Pointer(dstBase)
	o := (x - span.base()) / goarch.PtrSize
	i := o / ptrBits
	j := o % ptrBits
	const bits uintptr = elemsize / goarch.PtrSize
	// In the if statement below, we have to do two uintptr writes if the bits
	// we need to write straddle across two different memory locations. But if
	// the number of bits we're writing divides evenly into the number of bits
	// in the uintptr we're writing, this can never happen. Since bitsIsPowerOfTwo
	// is a compile-time constant in the generated code, in the case where the size is
	// a power of two less than or equal to ptrBits, the compiler can remove the
	// 'two writes' branch of the if statement and always do only one write without
	// the check.
	const bitsIsPowerOfTwo = bits&(bits-1) == 0
	if bits > ptrBits || (!bitsIsPowerOfTwo && j+bits > ptrBits) {
		// Two writes.
		bits0 := ptrBits - j
		bits1 := bits - bits0
		dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize))
		dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize))
		*dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j)
		*dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0)
	} else {
		// One write.
		dst := (*uintptr)(add(dst, i*goarch.PtrSize))
		*dst = (*dst)&^(((1<<(min(bits, ptrBits)))-1)<<j) | (src << j) // We're taking the min so this compiles on 32 bit platforms. But if bits > ptrBits we always take the other branch.
	}

	const doubleCheck = false
	if doubleCheck {
		writeHeapBitsDoubleCheck(span, x, dataSize, src, src0, i, j, bits, typ)
	}
	return scanSize
}

func writeHeapBitsDoubleCheck(span *mspan, x, dataSize, src, src0, i, j, bits uintptr, typ *_type) {
	srcRead := span.heapBitsSmallForAddr(x)
	if srcRead != src {
		print("runtime: x=", hex(x), " i=", i, " j=", j, " bits=", bits, "\n")
		print("runtime: dataSize=", dataSize, " typ.Size_=", typ.Size_, " typ.PtrBytes=", typ.PtrBytes, "\n")
		print("runtime: src0=", hex(src0), " src=", hex(src), " srcRead=", hex(srcRead), "\n")
		throw("bad pointer bits written for small object")
	}
}
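
The fast path of tinyStub is a bump allocator within a 16-byte block: the current offset is aligned for the request and, if the request still fits and a block exists, the object is carved out of it; otherwise a fresh maxTinySize block is taken from the tiny span class. Below is a minimal, hypothetical sketch of just that offset bookkeeping, outside the runtime (no real memory is managed, and the 32-bit 12-byte alignment special case is omitted); the names tinyCache and tinyAlloc are illustrative, not the runtime's.

package main

import "fmt"

const maxTinySize = 16

// tinyCache is a hypothetical stand-in for the mcache fields used by tinyStub.
type tinyCache struct {
	tiny       uintptr // base of the current tiny block (0 = none)
	tinyoffset uintptr // bytes already used in the current block
}

func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

// tinyAlloc mirrors the fit check in tinyStub: it returns the address carved
// out of the current block, or (0, false) when a new block is required.
func (c *tinyCache) tinyAlloc(size uintptr) (uintptr, bool) {
	off := c.tinyoffset
	// Conservative alignment, as in tinyStub (8/4/2 depending on size).
	switch {
	case size&7 == 0:
		off = alignUp(off, 8)
	case size&3 == 0:
		off = alignUp(off, 4)
	case size&1 == 0:
		off = alignUp(off, 2)
	}
	if off+size <= maxTinySize && c.tiny != 0 {
		c.tinyoffset = off + size
		return c.tiny + off, true
	}
	return 0, false // caller must allocate a new maxTinySize block
}

func main() {
	c := &tinyCache{tiny: 0x1000}
	for _, sz := range []uintptr{3, 5, 8, 4} {
		p, ok := c.tinyAlloc(sz)
		fmt.Printf("size=%d -> %#x ok=%v offset=%d\n", sz, p, ok, c.tinyoffset)
	}
}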
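
The allocCache scan shared by nextFreeFastStub and nextFreeFastTiny is easiest to see in isolation. The following standalone sketch uses plain Go types instead of mspan and gclinkptr (the names span and nextFreeFast here are illustrative, not the runtime's): TrailingZeros64 finds the lowest set bit of the cache, which is the offset of the next free object relative to freeindex; the cache is then shifted past that bit, except when the new freeindex lands on a 64-object boundary short of nelems, in which case the slower refill path must run.

package main

import (
	"fmt"
	"math/bits"
)

// span is a hypothetical, stripped-down stand-in for the runtime's mspan:
// allocCache caches free/used bits for 64 objects starting at freeindex.
type span struct {
	allocCache uint64 // bit i set => object (freeindex+i) is free
	freeindex  uint16
	nelems     uint16
}

// nextFreeFast mirrors the structure of nextFreeFastStub: it returns the
// index of the next free object, or -1 if the slow path (cache refill,
// possibly a new span) is needed.
func nextFreeFast(s *span) int {
	if s.allocCache == 0 {
		return -1 // cache exhausted
	}
	theBit := bits.TrailingZeros64(s.allocCache) // offset of the next free object
	result := s.freeindex + uint16(theBit)
	if result >= s.nelems {
		return -1 // past the end of the span
	}
	freeidx := result + 1
	if freeidx%64 == 0 && freeidx != s.nelems {
		return -1 // allocCache covers only 64 objects; it needs a refill
	}
	s.allocCache >>= uint(theBit + 1) // consume the bit we just used
	s.freeindex = freeidx
	return int(result)
}

func main() {
	s := &span{allocCache: 0b1011, freeindex: 0, nelems: 512}
	for i := 0; i < 4; i++ {
		fmt.Println(nextFreeFast(s)) // 0, 1, 3, -1
	}
}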
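
writeHeapBitsSmallStub builds a single word of pointer bits for the whole allocation by repeating the type's GC mask once per element when the allocation is a small slice backing store (dataSize is a multiple of typ.Size_). Here is a standalone sketch of just that repetition step, with the mask passed in as a plain uint64 instead of being read via getGCMask, and assuming a 64-bit word size; repeatGCMask is a hypothetical name.

package main

import "fmt"

const ptrSize = 8 // goarch.PtrSize on 64-bit platforms

// repeatGCMask mirrors the bitmap-repetition loop in writeHeapBitsSmallStub:
// src0 holds one pointer bit per word of a single element of size typSize;
// the result holds the bits for dataSize bytes (a multiple of typSize).
func repeatGCMask(src0 uint64, typSize, dataSize uint64) uint64 {
	if typSize == ptrSize {
		// Every word is a pointer: just set one bit per word of dataSize.
		return (1 << (dataSize / ptrSize)) - 1
	}
	src := src0
	for i := typSize; i < dataSize; i += typSize {
		src |= src0 << (i / ptrSize)
	}
	return src
}

func main() {
	// A hypothetical 16-byte element whose first word is a pointer: mask 0b01.
	// Three elements (48 bytes) => pointer bits at words 0, 2, and 4.
	fmt.Printf("%06b\n", repeatGCMask(0b01, 16, 48)) // prints 010101
}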