NativeArray<T> is a new type introduced recently in Unity 2018.1. It’s like List<T> except it’s backed by an unmanaged array instead of a managed array. It’s also a struct instead of a class. This means it creates no garbage for the GC to later collect. That’s the surface level description, but today we’ll go in depth to find out how it really works and learn some interesting facts along the way.

Constructors

Let’s start off by simply creating a NativeArray<int> in the most straightforward, usual way:

// Minimal test case: constructs a NativeArray<int> and discards the result,
// so the generated code contains little besides the constructor call.
public static class TestClass
{
	// len: number of int elements requested from the Persistent allocator.
	// NOTE(review): the array is never stored or disposed in this method —
	// presumably fine for a codegen experiment only; confirm before reuse.
	public static void Create(int len)
	{
		new NativeArray<int>(len, Allocator.Persistent);
	}
}

Now let’s look at the C++ that IL2CPP in Unity 2018.1.0f2 outputs for this function:

// Generated body of TestClass.Create(int): builds a NativeArray<int> in a
// local variable via its constructor and then returns without using it.
extern "C"  void TestClass_Create_m1711344578 (RuntimeObject * __this /* static, unused */, int32_t ___len0, const RuntimeMethod* method)
{
	// One-time setup guarded by a function-local flag; the flag is tested on
	// every call and set after the first.
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_Create_m1711344578_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		int32_t L_0 = ___len0;
		// The struct local is zero-filled before the constructor runs.
		NativeArray_1_t3237678471  L_1;
		memset(&L_1, 0, sizeof(L_1));
		// Arguments after the length: 4 is the allocator value emitted for
		// Allocator.Persistent, 1 is the options value (not written in the C#).
		NativeArray_1__ctor_m150647745((&L_1), L_0, 4, 1, /*hidden argument*/NativeArray_1__ctor_m150647745_RuntimeMethod_var);
		return;
	}
}

Since NativeArray<T> is a generic type, we get method initialization overhead. After that, we see a NativeArray_1_t3237678471-typed local variable, which is equivalent to NativeArray<int>. Its contents are cleared to all zeroes with memset to set all fields to their default values. Let’s look at the type to see what it contains:

// Generated layout for NativeArray<int>: a raw buffer pointer plus two
// 32-bit integers (element count and allocator label). No other fields are
// declared in this excerpt.
struct  NativeArray_1_t3237678471 
{
public:
	// System.Void* Unity.Collections.NativeArray`1::m_Buffer
	void* ___m_Buffer_0;
	// System.Int32 Unity.Collections.NativeArray`1::m_Length
	int32_t ___m_Length_1;
	// Unity.Collections.Allocator Unity.Collections.NativeArray`1::m_AllocatorLabel
	int32_t ___m_AllocatorLabel_2;
 
public:
	// [Jackson: removed a lot of field accessor methods]
};

It’s clear from this that a NativeArray<T> consists of a pointer to the unmanaged array’s memory (___m_Buffer_0), the length of the array (___m_Length_1), and the type of allocator (e.g. Allocator.Persistent) used to allocate the buffer (___m_AllocatorLabel_2).

Finally in our test function, the constructor is called: NativeArray_1__ctor_m150647745. Let’s take a look at it:

// Call-site wrapper: casts NativeArray_1__ctor_m150647745_gshared to the
// NativeArray<int> constructor signature and calls it with the given arguments.
#define NativeArray_1__ctor_m150647745(__this, p0, p1, p2, method) ((  void (*) (NativeArray_1_t3237678471 *, int32_t, int32_t, int32_t, const RuntimeMethod*))NativeArray_1__ctor_m150647745_gshared)(__this, p0, p1, p2, method)
 
// Constructor body: fills in *__this through an indirect call, then zeroes
// the buffer only when bit 0 of ___options2 is set.
extern "C"  void NativeArray_1__ctor_m150647745_gshared (NativeArray_1_t3237678471 * __this, int32_t ___length0, int32_t ___allocator1, int32_t ___options2, const RuntimeMethod* method)
{
	{
		int32_t L_0 = ___length0;
		int32_t L_1 = ___allocator1;
		// Indirect call through rgctx slot 0 with (length, allocator, this);
		// the surrounding article identifies the target as the static Allocate method.
		((  void (*) (RuntimeObject * /* static, unused */, int32_t, int32_t, NativeArray_1_t3237678471 *, const RuntimeMethod*))IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 0)->methodPointer)(NULL /*static, unused*/, (int32_t)L_0, (int32_t)L_1, (NativeArray_1_t3237678471 *)(NativeArray_1_t3237678471 *)__this, /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 0));
		int32_t L_2 = ___options2;
		// Jump past the clear unless (options & 1) == 1.
		if ((!(((uint32_t)((int32_t)((int32_t)L_2&(int32_t)1))) == ((uint32_t)1))))
		{
			goto IL_002b;
		}
	}
	{
		void* L_3 = (void*)__this->get_m_Buffer_0();
		int32_t L_4 = IL2CPP_NATIVEARRAY_GET_LENGTH(((NativeArray_1_t3237678471 *)(NativeArray_1_t3237678471 *)__this)->___m_Length_1);
		// Indirect call through rgctx slot 3 returning an int32; the same slot is
		// described as UnsafeUtility.SizeOf<T>() where Allocate is discussed.
		int32_t L_5 = ((  int32_t (*) (RuntimeObject * /* static, unused */, const RuntimeMethod*))IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 3)->methodPointer)(NULL /*static, unused*/, /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 3));
		// Clear length * element-size bytes; the product is formed in 64 bits.
		UnsafeUtility_MemClear_m2803322643(NULL /*static, unused*/, (void*)(void*)L_3, (int64_t)((int64_t)il2cpp_codegen_multiply((int64_t)(((int64_t)((int64_t)L_4))), (int64_t)(((int64_t)((int64_t)L_5))))), /*hidden argument*/NULL);
	}
 
IL_002b:
	{
		return;
	}
}

The constructor that’s called is actually a macro that casts NativeArray_1__ctor_m150647745_gshared to the right function pointer type and calls that real constructor function. Inside that function we immediately see a very long line that’s getting a pointer to the static Allocate method and calling it. The if after that is checking whether the options parameter lacks the bit indicating that we want to clear the unmanaged array’s memory to all zeroes. If the bit isn’t set, the constructor is done. If it is set, the buffer pointer and length are read, another super long line calls UnsafeUtility.SizeOf<T>() to get the element size, and the static UnsafeUtility.MemClear function is called with the buffer and the total size in bytes.

So what do these Allocate and MemClear functions do? To find out, let’s start by looking at Allocate:

// Allocates the backing buffer for *___array2 and records its length and
// allocator. The byte size is (value from rgctx slot 3) * ___length0, formed
// in 64 bits. No validation of ___length0 appears in this function.
extern "C"  void NativeArray_1_Allocate_m1826258112_gshared (RuntimeObject * __this /* static, unused */, int32_t ___length0, int32_t ___allocator1, NativeArray_1_t3237678471 * ___array2, const RuntimeMethod* method)
{
	int64_t V_0 = 0;
	{
		// rgctx slot 3: described by the article as UnsafeUtility.SizeOf<T>().
		int32_t L_0 = ((  int32_t (*) (RuntimeObject * /* static, unused */, const RuntimeMethod*))IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 3)->methodPointer)(NULL /*static, unused*/, /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 3));
		int32_t L_1 = ___length0;
		V_0 = (int64_t)((int64_t)il2cpp_codegen_multiply((int64_t)(((int64_t)((int64_t)L_0))), (int64_t)(((int64_t)((int64_t)L_1)))));
		NativeArray_1_t3237678471 * L_2 = ___array2;
		int64_t L_3 = V_0;
		// rgctx slot 5: described by the article as UnsafeUtility.AlignOf<T>().
		int32_t L_4 = ((  int32_t (*) (RuntimeObject * /* static, unused */, const RuntimeMethod*))IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 5)->methodPointer)(NULL /*static, unused*/, /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 5));
		int32_t L_5 = ___allocator1;
		// Malloc(size in bytes, alignment, allocator); the result is stored unchecked.
		void* L_6 = UnsafeUtility_Malloc_m4180434614(NULL /*static, unused*/, (int64_t)L_3, (int32_t)L_4, (int32_t)L_5, /*hidden argument*/NULL);
		L_2->set_m_Buffer_0((void*)L_6);
		NativeArray_1_t3237678471 * L_7 = ___array2;
		int32_t L_8 = ___length0;
		L_7->set_m_Length_1(L_8);
		NativeArray_1_t3237678471 * L_9 = ___array2;
		int32_t L_10 = ___allocator1;
		L_9->set_m_AllocatorLabel_2(L_10);
		return;
	}
}

To start, we see another super long line calling UnsafeUtility.SizeOf<T>(). Then there’s another super long line to call UnsafeUtility.AlignOf<T>(). Then there’s the call to UnsafeUtility.Malloc. Finally, the buffer, length, and allocator are set. Again, we need to follow up by looking at these functions. Let’s start with UnsafeUtility.SizeOf<T>():

// SizeOf<int> instantiation: returns sizeof(int32_t). Neither parameter is read.
extern "C"  int32_t UnsafeUtility_SizeOf_TisInt32_t2950945753_m3179273791_gshared (RuntimeObject * __this /* static, unused */, const RuntimeMethod* method)
{
	{
		// Held as uint32_t, then implicitly converted to the int32_t return type.
		uint32_t L_0 = sizeof(int32_t);
		return L_0;
	}
}

As advertised, this is literally just returning the constant size of an integer. Next, let’s look at UnsafeUtility.AlignOf<T>():

// AlignOf<int> instantiation: returns the literal 4. Neither parameter is read.
extern "C"  int32_t UnsafeUtility_AlignOf_TisInt32_t2950945753_m4132152064_gshared (RuntimeObject * __this /* static, unused */, const RuntimeMethod* method)
{
	{
		return ((int32_t)4);
	}
}

This also just returns a constant: 4. Let’s look at something more interesting: UnsafeUtility.Malloc:

// Thunk for UnsafeUtility.Malloc: resolves the implementation by its
// signature string on first use, keeps the pointer in a function-local
// static, and forwards (size, alignment, allocator) to it.
extern "C"  void* UnsafeUtility_Malloc_m4180434614 (RuntimeObject * __this /* static, unused */, int64_t ___size0, int32_t ___alignment1, int32_t ___allocator2, const RuntimeMethod* method)
{
	typedef void* (*UnsafeUtility_Malloc_m4180434614_ftn) (int64_t, int32_t, int32_t);
	static UnsafeUtility_Malloc_m4180434614_ftn _il2cpp_icall_func;
	// The lookup runs only while the cached pointer is still null.
	if (!_il2cpp_icall_func)
	_il2cpp_icall_func = (UnsafeUtility_Malloc_m4180434614_ftn)il2cpp_codegen_resolve_icall ("Unity.Collections.LowLevel.Unsafe.UnsafeUtility::Malloc(System.Int64,System.Int32,Unity.Collections.Allocator)");
	// NOTE(review): the pointer is called without a null check in this function;
	// how a failed resolve is reported is not visible here.
	void* retVal = _il2cpp_icall_func(___size0, ___alignment1, ___allocator2);
	return retVal;
}

This code has its own method initialization overhead, but in a more manual way. The _il2cpp_icall_func variable is checked on every invocation of the function and set on the first invocation to be the result of a dynamic lookup that searches for Unity.Collections.LowLevel.Unsafe.UnsafeUtility::Malloc by string. To see how that’s implemented, we open the Unity installation and look into libil2cpp’s InternalCalls.cpp:

// Registry mapping internal-call names to their function pointers.
typedef std::map<std::string, Il2CppMethodPointer> ICallMap;
static ICallMap s_InternalCalls;
 
// Looks up `name` in s_InternalCalls. If the exact string is not registered
// and it contains '(', the lookup is retried with everything from the first
// '(' onward removed. Returns the registered pointer, or NULL when neither
// form is found.
Il2CppMethodPointer InternalCalls::Resolve(const char* name)
{
    // Try to find the whole name first, then search using just type::method
    // if parameters were passed
    // ex: First, System.Foo::Bar(System.Int32)
    // Then, System.Foo::Bar
    ICallMap::iterator res = s_InternalCalls.find(name);
 
    if (res != s_InternalCalls.end())
        return res->second;
 
    std::string shortName(name);
    size_t index = shortName.find('(');
 
    if (index != std::string::npos)
    {
        // Keep only the part before the parameter list.
        shortName = shortName.substr(0, index);
        res = s_InternalCalls.find(shortName);
 
        if (res != s_InternalCalls.end())
            return res->second;
    }
 
    return NULL;
}

This code creates a std::string out of the string literal and uses it to search a std::map (a tree structure) for the function pointer that really implements Malloc. That Malloc implementation is inside the Unity engine, so we can’t see how it works.

Jumping all the way back to the constructor, there was an optional call to UnsafeUtility.MemClear. Let’s check it out to see how it works:

// Thunk for UnsafeUtility.MemClear: resolves the implementation by its
// signature string on first use, keeps the pointer in a function-local
// static, and forwards (destination, size) to it.
extern "C"  void UnsafeUtility_MemClear_m2803322643 (RuntimeObject * __this /* static, unused */, void* ___destination0, int64_t ___size1, const RuntimeMethod* method)
{
	typedef void (*UnsafeUtility_MemClear_m2803322643_ftn) (void*, int64_t);
	static UnsafeUtility_MemClear_m2803322643_ftn _il2cpp_icall_func;
	// The lookup runs only while the cached pointer is still null.
	if (!_il2cpp_icall_func)
	_il2cpp_icall_func = (UnsafeUtility_MemClear_m2803322643_ftn)il2cpp_codegen_resolve_icall ("Unity.Collections.LowLevel.Unsafe.UnsafeUtility::MemClear(System.Void*,System.Int64)");
	_il2cpp_icall_func(___destination0, ___size1);
}

This is just like with Malloc: a dynamic lookup into an internal Unity engine function that we can’t see.

Conclusion: Constructing a NativeArray<T> should normally be reasonably fast. There’s a little fluff in there for method initialization overhead and some extra function calls, but the important parts (Malloc and MemClear) are presumably handled efficiently inside the Unity engine.

Indexer Reads and Writes

Reading and writing elements is the main reason to create a NativeArray<T> in the first place, so let’s look at that starting with the reading side:

// Test case isolating a single read through the NativeArray<int> indexer.
public static class TestClass
{
	// Returns the element of nativeArray at position index.
	public static int IndexerGet(NativeArray<int> nativeArray, int index)
	{
		return nativeArray[index];
	}
}

Here’s the IL2CPP output for this:

// Generated body of TestClass.IndexerGet: reads one int32_t from the array's
// buffer via IL2CPP_NATIVEARRAY_GET_ITEM. No initialization guard, function
// call, or index check appears in this function.
extern "C"  int32_t TestClass_IndexerGet_m164063199 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, int32_t ___index1, const RuntimeMethod* method)
{
	{
		int32_t L_0 = ___index1;
		int32_t L_1 = IL2CPP_NATIVEARRAY_GET_ITEM(int32_t, ((NativeArray_1_t3237678471 *)(&___nativeArray0))->___m_Buffer_0, L_0);
		return L_1;
	}
}

This should be a surprising output! Normally we’d see method initialization overhead because we’re using a generic type. We’d also normally see a call to the indexer’s get function. Instead, we just see a macro call: IL2CPP_NATIVEARRAY_GET_ITEM. Let’s look at that macro:

// Expands to a dereference of (TTField reinterpreted as TElementType*) + TIndex.
// NOTE(review): neither TIndex nor the whole expansion is parenthesized, so
// the result depends on the arguments being simple expressions.
#define IL2CPP_NATIVEARRAY_GET_ITEM(TElementType, TTField, TIndex) \
    *(reinterpret_cast<TElementType*>(TTField) + TIndex)

This is just indexing into the unmanaged array. Let’s look at the ARM64 assembly that Xcode 9.3 compiles this function to and see just how minimal the code is:

	ldr	w0, [x1, w3, sxtw #2]
	ret

The first instruction loads from an offset into the array and the second returns from the function.

Now let’s look at the writing side by using the set part of the indexer:

// Test case isolating a single write through the NativeArray<int> indexer.
public static class TestClass
{
	// Stores val into nativeArray at position index.
	public static void IndexerSet(NativeArray<int> nativeArray, int index, int val)
	{
		nativeArray[index] = val;
	}
}

Here’s the IL2CPP output for this:

// Generated body of TestClass.IndexerSet: writes one int32_t into the array's
// buffer via IL2CPP_NATIVEARRAY_SET_ITEM. No initialization guard, function
// call, or index check appears in this function.
extern "C"  void TestClass_IndexerSet_m317380497 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, int32_t ___index1, int32_t ___val2, const RuntimeMethod* method)
{
	{
		int32_t L_0 = ___index1;
		int32_t L_1 = ___val2;
		IL2CPP_NATIVEARRAY_SET_ITEM(int32_t, ((NativeArray_1_t3237678471 *)(&___nativeArray0))->___m_Buffer_0, L_0, L_1);
		return;
	}
}

Again we see a single macro call with no method initialization overhead or function calls. Let’s check out the macro:

// Expands to an assignment of TValue to the element at
// (TTField reinterpreted as TElementType*) + TIndex.
// NOTE(review): the expansion already ends in ';', so a call site that adds
// its own ';' produces an extra empty statement.
#define IL2CPP_NATIVEARRAY_SET_ITEM(TElementType, TTField, TIndex, TValue) \
   *(reinterpret_cast<TElementType*>(TTField) + TIndex) = TValue;

This is just like the other macro, except it’s writing the array element instead of reading. Just to confirm that this is efficiently compiled, let’s look at the assembly:

	str	w4, [x1, w3, sxtw #2]
	ret

Again there are only two instructions: store at an offset from the array and return from the function.

Conclusion: Reading from or writing to a NativeArray<T> using its indexer is completely optimal due to IL2CPP’s special-case code generation using direct reads and writes.

Length

Perhaps the next most common usage of NativeArray<T> is to query its length, such as in a loop. Let’s try using the Length property’s get function:

// Test case isolating a read of the NativeArray<int>.Length property.
public static class TestClass
{
	// Returns nativeArray.Length unchanged.
	public static int Length(NativeArray<int> nativeArray)
	{
		return nativeArray.Length;
	}
}

Here’s the C++ that IL2CPP generates:

// Generated body of TestClass.Length: returns the ___m_Length_1 field of the
// by-value array argument through IL2CPP_NATIVEARRAY_GET_LENGTH.
extern "C"  int32_t TestClass_Length_m3693338766 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, const RuntimeMethod* method)
{
	{
		int32_t L_0 = IL2CPP_NATIVEARRAY_GET_LENGTH(((NativeArray_1_t3237678471 *)(&___nativeArray0))->___m_Length_1);
		return L_0;
	}
}

Like with the indexer, IL2CPP is generating a macro call involving the length field we saw earlier: ___m_Length_1. There’s no method initialization overhead for using a generic type and there’s no call to a function for the property. Let’s look at the macro:

// Expands to its argument wrapped in parentheses; no other work is done.
#define IL2CPP_NATIVEARRAY_GET_LENGTH(TLengthField) \
   (TLengthField)

This is the identity macro, which means the function is just accessing the length field.

Conclusion: The Length property’s get is completely optimal due to IL2CPP generating a simple field access without any of the usual overhead.

For Loops

Putting these together, let’s write a for loop:

// Test case: index-based loop with the length read once before the loop.
public static class TestClass
{
	// Returns the sum of every element of nativeArray (0 when Length is 0).
	public static int For(NativeArray<int> nativeArray)
	{
		int sum = 0;
		// len caches nativeArray.Length so the property is read a single time.
		for (int i = 0, len = nativeArray.Length; i < len; ++i)
		{
			sum += nativeArray[i];
		}
		return sum;
	}
}

Here’s the IL2CPP output:

// Generated body of TestClass.For. Locals: V_0 = running sum, V_1 = loop
// index, V_2 = length read once before the loop.
extern "C"  int32_t TestClass_For_m2039679413 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, const RuntimeMethod* method)
{
	int32_t V_0 = 0;
	int32_t V_1 = 0;
	int32_t V_2 = 0;
	{
		V_0 = 0;
		V_1 = 0;
		int32_t L_0 = IL2CPP_NATIVEARRAY_GET_LENGTH(((NativeArray_1_t3237678471 *)(&___nativeArray0))->___m_Length_1);
		V_2 = L_0;
		// Enter at the condition so an empty array never runs the body.
		goto IL_0020;
	}
 
IL_0011:
	// Loop body: add element V_1 to the sum, then increment the index.
	{
		int32_t L_1 = V_0;
		int32_t L_2 = V_1;
		int32_t L_3 = IL2CPP_NATIVEARRAY_GET_ITEM(int32_t, ((NativeArray_1_t3237678471 *)(&___nativeArray0))->___m_Buffer_0, L_2);
		V_0 = ((int32_t)il2cpp_codegen_add((int32_t)L_1, (int32_t)L_3));
		int32_t L_4 = V_1;
		V_1 = ((int32_t)il2cpp_codegen_add((int32_t)L_4, (int32_t)1));
	}
 
IL_0020:
	// Loop condition: continue while index < cached length.
	{
		int32_t L_5 = V_1;
		int32_t L_6 = V_2;
		if ((((int32_t)L_5) < ((int32_t)L_6)))
		{
			goto IL_0011;
		}
	}
	{
		int32_t L_7 = V_0;
		return L_7;
	}
}

There’s also no method initialization overhead here as we’re just using the macro-based functionality. As usual, IL2CPP generates goto-based flow control so it’s harder to read than a simple for loop, but this is basically just what we’d expect. There are also the usual calls to il2cpp_codegen_add instead of using the + operator, but that’s more or less equivalent. Let’s look at the assembly this compiles to:

	cmp		w2, #1
	b.lt	LBB6_4
	mov	w0, #0
	and	x8, x2, #0xffffffff
LBB6_2:
	ldr	w9, [x1], #4
	add		w0, w9, w0
	subs	x8, x8, #1
	b.ne	LBB6_2
	ret
LBB6_4:
	mov	w0, #0
	ret

This is a simple and straightforward loop that matches the goto-based IL2CPP output quite well.

Now let’s try writing the same for loop but instead calling the Length property’s get function every iteration instead of caching it in a local variable:

// Test case: same summation as an index-based loop, but Length is read in the
// loop condition on every iteration instead of being cached in a local.
public static class TestClass
{
	// Returns the sum of every element of nativeArray (0 when Length is 0).
	public static int ForNoCache(NativeArray<int> nativeArray)
	{
		int sum = 0;
		for (int i = 0; i < nativeArray.Length; ++i)
		{
			sum += nativeArray[i];
		}
		return sum;
	}
}

Here’s the C++ for this:

// Generated body of TestClass.ForNoCache. Locals: V_0 = running sum,
// V_1 = loop index. The length field is read inside the loop condition.
extern "C"  int32_t TestClass_ForNoCache_m105393772 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, const RuntimeMethod* method)
{
	int32_t V_0 = 0;
	int32_t V_1 = 0;
	{
		V_0 = 0;
		V_1 = 0;
		// Enter at the condition so an empty array never runs the body.
		goto IL_0018;
	}
 
IL_0009:
	// Loop body: add element V_1 to the sum, then increment the index.
	{
		int32_t L_0 = V_0;
		int32_t L_1 = V_1;
		int32_t L_2 = IL2CPP_NATIVEARRAY_GET_ITEM(int32_t, ((NativeArray_1_t3237678471 *)(&___nativeArray0))->___m_Buffer_0, L_1);
		V_0 = ((int32_t)il2cpp_codegen_add((int32_t)L_0, (int32_t)L_2));
		int32_t L_3 = V_1;
		V_1 = ((int32_t)il2cpp_codegen_add((int32_t)L_3, (int32_t)1));
	}
 
IL_0018:
	// Loop condition: re-reads ___m_Length_1 each time it is evaluated.
	{
		int32_t L_4 = V_1;
		int32_t L_5 = IL2CPP_NATIVEARRAY_GET_LENGTH(((NativeArray_1_t3237678471 *)(&___nativeArray0))->___m_Length_1);
		if ((((int32_t)L_4) < ((int32_t)L_5)))
		{
			goto IL_0009;
		}
	}
	{
		int32_t L_6 = V_0;
		return L_6;
	}
}

We can clearly see the IL2CPP_NATIVEARRAY_GET_LENGTH macro call here instead of the local variable usage in the previous version. Now for the interesting part: what assembly does this compile to?

	cmp		w2, #1
	b.lt	LBB7_4
	mov	w0, #0
	and	x8, x2, #0xffffffff
LBB7_2:
	ldr	w9, [x1], #4
	add		w0, w9, w0
	subs	x8, x8, #1
	b.ne	LBB7_2
	ret
LBB7_4:
	mov	w0, #0
	ret

The names of the labels have changed from the previous version with the cached local variable, but otherwise this is identical. A register (x8) is being used to hold the length, so we’re not performing a memory read operation in every iteration of the loop.

Conclusion: A for loop over a NativeArray<T> is optimal regardless of whether the Length is cached or not.

Foreach

Next up, let’s compare the for loop with a foreach:

// Test case: the same summation written with foreach, for comparison with the
// index-based loops.
public static class TestClass
{
	// Returns the sum of every element yielded by enumerating nativeArray.
	public static int Foreach(NativeArray<int> nativeArray)
	{
		int sum = 0;
		foreach (int cur in nativeArray)
		{
			sum += cur;
		}
		return sum;
	}
}

Here’s what IL2CPP outputs:

// Generated body of TestClass.Foreach. Locals: V_0 = running sum,
// V_1 = current element, V_2 = the enumerator struct. The loop runs inside a
// try block, and the enumerator's Dispose is called in the finally section on
// both the normal and the exception path.
extern "C"  int32_t TestClass_Foreach_m301387384 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, const RuntimeMethod* method)
{
	// One-time setup guarded by a function-local flag tested on every call.
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_Foreach_m301387384_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	int32_t V_0 = 0;
	int32_t V_1 = 0;
	Enumerator_t4154966508  V_2;
	memset(&V_2, 0, sizeof(V_2));
	// Bookkeeping locals for the try/finally emulation below.
	Exception_t * __last_unhandled_exception = 0;
	NO_UNUSED_WARNING (__last_unhandled_exception);
	Exception_t * __exception_local = 0;
	NO_UNUSED_WARNING (__exception_local);
	int32_t __leave_target = 0;
	NO_UNUSED_WARNING (__leave_target);
	{
		V_0 = 0;
		Enumerator_t4154966508  L_0 = NativeArray_1_GetEnumerator_m687430469((NativeArray_1_t3237678471 *)(&___nativeArray0), /*hidden argument*/NativeArray_1_GetEnumerator_m687430469_RuntimeMethod_var);
		V_2 = L_0;
	}
 
IL_000a:
	try
	{ // begin try (depth: 1)
		{
			// Enter at MoveNext so Current is only read after a successful advance.
			goto IL_001b;
		}
 
IL_000f:
		// Loop body: fetch Current and add it to the sum.
		{
			int32_t L_1 = Enumerator_get_Current_m2998795319((Enumerator_t4154966508 *)(&V_2), /*hidden argument*/Enumerator_get_Current_m2998795319_RuntimeMethod_var);
			V_1 = L_1;
			int32_t L_2 = V_0;
			int32_t L_3 = V_1;
			V_0 = ((int32_t)il2cpp_codegen_add((int32_t)L_2, (int32_t)L_3));
		}
 
IL_001b:
		// Loop condition: continue while MoveNext returns true.
		{
			bool L_4 = Enumerator_MoveNext_m4262063796((Enumerator_t4154966508 *)(&V_2), /*hidden argument*/Enumerator_MoveNext_m4262063796_RuntimeMethod_var);
			if (L_4)
			{
				goto IL_000f;
			}
		}
 
IL_0027:
		{
			IL2CPP_LEAVE(0x3A, FINALLY_002c);
		}
	} // end try (depth: 1)
	catch(Il2CppExceptionWrapper& e)
	{
		// Remember the exception and run the finally section before it is dealt with.
		__last_unhandled_exception = (Exception_t *)e.ex;
		goto FINALLY_002c;
	}
 
FINALLY_002c:
	{ // begin finally (depth: 1)
		Enumerator_Dispose_m1535598059((Enumerator_t4154966508 *)(&V_2), /*hidden argument*/Enumerator_Dispose_m1535598059_RuntimeMethod_var);
		IL2CPP_END_FINALLY(44)
	} // end finally (depth: 1)
	IL2CPP_CLEANUP(44)
	{
		IL2CPP_JUMP_TBL(0x3A, IL_003a)
		IL2CPP_RETHROW_IF_UNHANDLED(Exception_t *)
	}
 
IL_003a:
	{
		int32_t L_5 = V_0;
		return L_5;
	}
}

As usual, there’s a lot of exception-related code because Dispose must be called on the enumerator in a finally block. Also, all of the index code has been replaced with calls to the usual Current property and MoveNext method. Can the C++ compiler see through this and produce equivalent code to the for loop? Let’s find out by looking at the compiled assembly:

	sub	sp, sp, #128
	stp	x22, x21, [sp, #80]
	stp	x20, x19, [sp, #96]
	stp	x29, x30, [sp, #112]
	add	x29, sp, #112
	stp	x1, x2, [x29, #-48]
	adrp	x19, __ZZ28TestClass_Foreach_m301387384E25s_Il2CppMethodInitialized@PAGE
	ldrb	w8, [x19, __ZZ28TestClass_Foreach_m301387384E25s_Il2CppMethodInitialized@PAGEOFF]
	tbnz	w8, #0, LBB8_2
	adrp	x8, _TestClass_Foreach_m301387384_MetadataUsageId@GOTPAGE
	ldr	x8, [x8, _TestClass_Foreach_m301387384_MetadataUsageId@GOTPAGEOFF]
	ldr		w0, [x8]
	bl	__ZN6il2cpp2vm13MetadataCache24InitializeMethodMetadataEj
	orr	w8, wzr, #0x1
	strb	w8, [x19, __ZZ28TestClass_Foreach_m301387384E25s_Il2CppMethodInitialized@PAGEOFF]
LBB8_2:
	stp	xzr, xzr, [sp, #32]
	str	xzr, [sp, #48]
	adrp	x8, _NativeArray_1_GetEnumerator_m687430469_RuntimeMethod_var@GOTPAGE
	ldr	x8, [x8, _NativeArray_1_GetEnumerator_m687430469_RuntimeMethod_var@GOTPAGEOFF]
	ldr		x1, [x8]
	add	x8, sp, #8
	sub	x0, x29, #48
	bl	_NativeArray_1_GetEnumerator_m687430469_gshared
	mov	w19, #0
	ldr	x8, [sp, #24]
	str	x8, [sp, #48]
	ldur	q0, [sp, #8]
	str	q0, [sp, #32]
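	adrp	x20, _Enumerator_MoveNext_m4262063796_RuntimeMethod_var@GOTPAGE
	ldr	x20, [x20, _Enumerator_MoveNext_m4262063796_RuntimeMethod_var@GOTPAGEOFF]
	adrp	x21, _Enumerator_get_Current_m2998795319_RuntimeMethod_var@GOTPAGE
	ldr	x21, [x21, _Enumerator_get_Current_m2998795319_RuntimeMethod_var@GOTPAGEOFF]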
	b	LBB8_5
LBB8_3:
	ldr		x1, [x21]
	add	x0, sp, #32
	bl	_Enumerator_get_Current_m2998795319_gshared
	add		w19, w0, w19
LBB8_5:
	ldr		x1, [x20]
	add	x0, sp, #32
	bl	_Enumerator_MoveNext_m4262063796_gshared
	tbnz	w0, #0, LBB8_3
	mov	x20, #0
	orr	w21, wzr, #0x1
	b	LBB8_12
LBB8_8:
	b	LBB8_10
LBB8_9:
LBB8_10:
	cmp		w1, #1
	b.ne	LBB8_16
	bl	___cxa_begin_catch
	ldr		x20, [x0]
	bl	___cxa_end_catch
	mov	w21, #0
LBB8_12:
	adrp	x8, _Enumerator_Dispose_m1535598059_RuntimeMethod_var@GOTPAGE
	ldr	x8, [x8, _Enumerator_Dispose_m1535598059_RuntimeMethod_var@GOTPAGEOFF]
	ldr		x1, [x8]
	add	x0, sp, #32
	bl	_Enumerator_Dispose_m1535598059_gshared
	tbnz	w21, #0, LBB8_15
	cbz	x20, LBB8_15
	mov	 x0, x20
	mov	x1, #0
	mov	x2, #0
	bl	__ZN6il2cpp2vm9Exception5RaiseEP15Il2CppExceptionP19Il2CppSequencePointP10MethodInfo
LBB8_15:
	mov	 x0, x19
	ldp	x29, x30, [sp, #112]
	ldp	x20, x19, [sp, #96]
	ldp	x22, x21, [sp, #80]
	add	sp, sp, #128
	ret

The differences between this code and the for loop are night and day. All of the code for method initialization, exceptions, and enumerators remains and the result is a big, expensive loop.

Conclusion: A foreach loop is significantly more expensive than a for loop with NativeArray<T>.

CopyFrom

Next there’s CopyFrom. There are two overloaded methods allowing copies from managed arrays and NativeArray<T>. Let’s start with the managed array version:

// Test case isolating NativeArray<int>.CopyFrom with a managed int[] source.
public static class TestClass
{
	// Copies from fromArray (managed) into nativeArray.
	public static void CopyFromManagedArray(
		NativeArray<int> nativeArray,
		int[] fromArray)
	{
		nativeArray.CopyFrom(fromArray);
	}
}

Here’s the C++ for this:

// Generated body of TestClass.CopyFromManagedArray: after the one-time
// initialization guard, forwards the managed array to CopyFrom on the address
// of the by-value native array argument.
extern "C"  void TestClass_CopyFromManagedArray_m675868485 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, Int32U5BU5D_t385246372* ___fromArray1, const RuntimeMethod* method)
{
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_CopyFromManagedArray_m675868485_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		Int32U5BU5D_t385246372* L_0 = ___fromArray1;
		NativeArray_1_CopyFrom_m892013632((NativeArray_1_t3237678471 *)(&___nativeArray0), L_0, /*hidden argument*/NativeArray_1_CopyFrom_m892013632_RuntimeMethod_var);
		return;
	}
}

This is really just a call to CopyFrom, so let’s go look at that:

// Call-site wrapper: casts NativeArray_1_CopyFrom_m892013632_gshared to the
// (NativeArray<int>*, int[]*, method) signature and calls it.
#define NativeArray_1_CopyFrom_m892013632(__this, p0, method) ((  void (*) (NativeArray_1_t3237678471 *, Int32U5BU5D_t385246372*, const RuntimeMethod*))NativeArray_1_CopyFrom_m892013632_gshared)(__this, p0, method)

It turns out this is a macro that calls through a function pointer to the real function:

// CopyFrom(managed array): copies element by element from ___array0 into this
// array's buffer. The loop bound is this array's ___m_Length_1; no comparison
// of the two arrays' lengths appears in this function. V_0 is the loop index.
extern "C"  void NativeArray_1_CopyFrom_m892013632_gshared (NativeArray_1_t3237678471 * __this, Int32U5BU5D_t385246372* ___array0, const RuntimeMethod* method)
{
	int32_t V_0 = 0;
	{
		V_0 = (int32_t)0;
		goto IL_001f;
	}
 
IL_0008:
	// Loop body: read source element V_0, write it to slot V_0 of the buffer.
	{
		void* L_0 = (void*)__this->get_m_Buffer_0();
		int32_t L_1 = V_0;
		Int32U5BU5D_t385246372* L_2 = ___array0;
		int32_t L_3 = V_0;
		// Null check and GetAt are executed on every iteration.
		NullCheck(L_2);
		int32_t L_4 = L_3;
		int32_t L_5 = (L_2)->GetAt(static_cast<il2cpp_array_size_t>(L_4));
		// Indirect call through rgctx slot 7 with (buffer, index, value); the
		// article identifies the target as UnsafeUtility.WriteArrayElement.
		((  void (*) (RuntimeObject * /* static, unused */, void*, int32_t, int32_t, const RuntimeMethod*))IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 7)->methodPointer)(NULL /*static, unused*/, (void*)(void*)L_0, (int32_t)L_1, (int32_t)L_5, /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 7));
		int32_t L_6 = V_0;
		V_0 = (int32_t)((int32_t)il2cpp_codegen_add((int32_t)L_6, (int32_t)1));
	}
 
IL_001f:
	// Loop condition: continue while index < this array's length.
	{
		int32_t L_7 = V_0;
		int32_t L_8 = IL2CPP_NATIVEARRAY_GET_LENGTH(((NativeArray_1_t3237678471 *)(NativeArray_1_t3237678471 *)__this)->___m_Length_1);
		if ((((int32_t)L_7) < ((int32_t)L_8)))
		{
			goto IL_0008;
		}
	}
	{
		return;
	}
}

This is essentially just a loop over the array that calls the static UnsafeUtility.WriteArrayElement on each element via another really long line of code. To get the element to write, there’s a NullCheck and a bounds-checked GetAt call. Let’s see how WriteArrayElement is implemented:

// WriteArrayElement<int> instantiation: stores ___value2 at byte offset
// ___index1 * sizeof(int32_t) from ___destination0. No check of the index or
// the destination pointer appears in this function.
extern "C"  void UnsafeUtility_WriteArrayElement_TisInt32_t2950945753_m3798695422_gshared (RuntimeObject * __this /* static, unused */, void* ___destination0, int32_t ___index1, int32_t ___value2, const RuntimeMethod* method)
{
	{
		void* L_0 = ___destination0;
		int32_t L_1 = ___index1;
		uint32_t L_2 = sizeof(int32_t);
		int32_t L_3 = ___value2;
		// The byte offset is computed with int32_t operands before being added
		// to the pointer (cast to intptr_t).
		*(int32_t*)((void*)il2cpp_codegen_add((intptr_t)L_0, (int32_t)((int32_t)il2cpp_codegen_multiply((int32_t)L_1, (int32_t)(((int32_t)((int32_t)L_2))))))) = L_3;
		return;
	}
}

This is just setting an element by writing into an offset from the start of the unmanaged array.

Now let’s compare by looking at the overload of CopyFrom that copies from a NativeArray<T>:

// Test case isolating NativeArray<int>.CopyFrom with a NativeArray<int> source.
public static class TestClass
{
	// Copies from fromArray into nativeArray.
	public static void CopyFromNativeArray(
		NativeArray<int> nativeArray,
		NativeArray<int> fromArray)
	{
		nativeArray.CopyFrom(fromArray);
	}
}

Here’s what IL2CPP outputs:

// Generated body of TestClass.CopyFromNativeArray: after the one-time
// initialization guard, passes a copy of the source struct to CopyFrom on the
// address of the by-value destination argument.
extern "C"  void TestClass_CopyFromNativeArray_m1592963316 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, NativeArray_1_t3237678471  ___fromArray1, const RuntimeMethod* method)
{
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_CopyFromNativeArray_m1592963316_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		NativeArray_1_t3237678471  L_0 = ___fromArray1;
		NativeArray_1_CopyFrom_m2953825274((NativeArray_1_t3237678471 *)(&___nativeArray0), L_0, /*hidden argument*/NativeArray_1_CopyFrom_m2953825274_RuntimeMethod_var);
		return;
	}
}

This is just another call to CopyFrom, so let’s look at that:

// Call-site wrapper: casts NativeArray_1_CopyFrom_m2953825274_gshared to the
// (NativeArray<int>*, NativeArray<int> by value, method) signature and calls it.
#define NativeArray_1_CopyFrom_m2953825274(__this, p0, method) ((  void (*) (NativeArray_1_t3237678471 *, NativeArray_1_t3237678471 , const RuntimeMethod*))NativeArray_1_CopyFrom_m2953825274_gshared)(__this, p0, method)

It turns out this is also a macro, so let’s look at the function it calls:

// CopyFrom(NativeArray): implemented by calling CopyTo with the roles
// swapped — the source (___array0) becomes the receiver and a copy of *__this
// is passed as the destination argument.
extern "C"  void NativeArray_1_CopyFrom_m2953825274_gshared (NativeArray_1_t3237678471 * __this, NativeArray_1_t3237678471  ___array0, const RuntimeMethod* method)
{
	{
		NativeArray_1_CopyTo_m966487794((NativeArray_1_t3237678471 *)(NativeArray_1_t3237678471 *)(&___array0), (NativeArray_1_t3237678471 )(*(NativeArray_1_t3237678471 *)__this), /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 8));
		return;
	}
}

CopyFrom actually calls CopyTo on the other NativeArray<T>. Let’s look at that:

// Call-site wrapper: casts NativeArray_1_CopyTo_m966487794_gshared to the
// (NativeArray<int>*, NativeArray<int> by value, method) signature and calls it.
#define NativeArray_1_CopyTo_m966487794(__this, p0, method) ((  void (*) (NativeArray_1_t3237678471 *, NativeArray_1_t3237678471 , const RuntimeMethod*))NativeArray_1_CopyTo_m966487794_gshared)(__this, p0, method)
 
// CopyTo(NativeArray): one MemCpy from this array's buffer into ___array0's
// buffer. The byte count is this array's length times the value from rgctx
// slot 3; ___array0's length is not read in this function.
extern "C"  void NativeArray_1_CopyTo_m966487794_gshared (NativeArray_1_t3237678471 * __this, NativeArray_1_t3237678471  ___array0, const RuntimeMethod* method)
{
	{
		// L_0 = destination buffer, L_1 = source buffer.
		void* L_0 = (void*)(&___array0)->get_m_Buffer_0();
		void* L_1 = (void*)__this->get_m_Buffer_0();
		int32_t L_2 = IL2CPP_NATIVEARRAY_GET_LENGTH(((NativeArray_1_t3237678471 *)(NativeArray_1_t3237678471 *)__this)->___m_Length_1);
		// rgctx slot 3: described by the article as UnsafeUtility.SizeOf<T>().
		int32_t L_3 = ((  int32_t (*) (RuntimeObject * /* static, unused */, const RuntimeMethod*))IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 3)->methodPointer)(NULL /*static, unused*/, /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 3));
		UnsafeUtility_MemCpy_m1650311498(NULL /*static, unused*/, (void*)(void*)L_0, (void*)(void*)L_1, (int64_t)((int64_t)il2cpp_codegen_multiply((int64_t)(((int64_t)((int64_t)L_2))), (int64_t)(((int64_t)((int64_t)L_3))))), /*hidden argument*/NULL);
		return;
	}
}

This is another macro that calls a function, but now we’ve found the function doing the work. It gets the Length via the usual macro, calls UnsafeUtility.SizeOf<T>, and then calls UnsafeUtility.MemCpy to copy the contents of the unmanaged array into the unmanaged array of the other NativeArray<T>. Next, let’s check out MemCpy:

// Thunk for UnsafeUtility.MemCpy: resolves the implementation by its
// signature string on first use, keeps the pointer in a function-local
// static, and forwards (destination, source, size) to it.
extern "C"  void UnsafeUtility_MemCpy_m1650311498 (RuntimeObject * __this /* static, unused */, void* ___destination0, void* ___source1, int64_t ___size2, const RuntimeMethod* method)
{
	typedef void (*UnsafeUtility_MemCpy_m1650311498_ftn) (void*, void*, int64_t);
	static UnsafeUtility_MemCpy_m1650311498_ftn _il2cpp_icall_func;
	// The lookup runs only while the cached pointer is still null.
	if (!_il2cpp_icall_func)
	_il2cpp_icall_func = (UnsafeUtility_MemCpy_m1650311498_ftn)il2cpp_codegen_resolve_icall ("Unity.Collections.LowLevel.Unsafe.UnsafeUtility::MemCpy(System.Void*,System.Void*,System.Int64)");
	_il2cpp_icall_func(___destination0, ___source1, ___size2);
}

This is another call into the Unity engine, so the trail ends here.

Conclusion: Using CopyFrom with a managed array suffers from null and bounds checks in its inner loop. Consider using the custom version at the end of the article instead. When using CopyFrom with a NativeArray<T>, a presumably efficient MemCpy is used instead.

CopyTo

We’ve already seen one overload of CopyTo that copies to another NativeArray<T>. Now let’s look at the version that copies to a managed array:

// Test case isolating NativeArray<int>.CopyTo with a managed int[] destination.
public static class TestClass
{
	// Copies from nativeArray into toArray (managed).
	public static void CopyToManagedArray(
		NativeArray<int> nativeArray,
		int[] toArray)
	{
		nativeArray.CopyTo(toArray);
	}
}

Here’s what IL2CPP generates for this function:

// Generated body of TestClass.CopyToManagedArray: after the one-time
// initialization guard, forwards the managed array to CopyTo on the address of
// the by-value native array argument.
extern "C"  void TestClass_CopyToManagedArray_m1995515236 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, Int32U5BU5D_t385246372* ___toArray1, const RuntimeMethod* method)
{
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_CopyToManagedArray_m1995515236_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		Int32U5BU5D_t385246372* L_0 = ___toArray1;
		NativeArray_1_CopyTo_m1784392623((NativeArray_1_t3237678471 *)(&___nativeArray0), L_0, /*hidden argument*/NativeArray_1_CopyTo_m1784392623_RuntimeMethod_var);
		return;
	}
}

Here we get the method initialization for using generics and a call to CopyTo:

// Casts NativeArray_1_CopyTo_m1784392623_gshared to the concrete function-pointer
// type shown below and invokes it with the same three arguments.
#define NativeArray_1_CopyTo_m1784392623(__this, p0, method) ((  void (*) (NativeArray_1_t3237678471 *, Int32U5BU5D_t385246372*, const RuntimeMethod*))NativeArray_1_CopyTo_m1784392623_gshared)(__this, p0, method)
 
// Element-by-element copy from the native buffer behind __this into the managed
// array ___array0. The loop is expressed with labels and gotos: IL_0008 is the
// loop body and IL_001f is the loop condition.
extern "C"  void NativeArray_1_CopyTo_m1784392623_gshared (NativeArray_1_t3237678471 * __this, Int32U5BU5D_t385246372* ___array0, const RuntimeMethod* method)
{
	// V_0 is the loop index.
	int32_t V_0 = 0;
	{
		V_0 = (int32_t)0;
		// Jump straight to the condition so a zero-length array never enters the body.
		goto IL_001f;
	}
 
IL_0008:
	{
		Int32U5BU5D_t385246372* L_0 = ___array0;
		int32_t L_1 = V_0;
		void* L_2 = (void*)__this->get_m_Buffer_0();
		int32_t L_3 = V_0;
		// Calls the method held in rgctx slot 6 of method->klass with (buffer, index);
		// its int32_t result is the value stored into the managed array below.
		int32_t L_4 = ((  int32_t (*) (RuntimeObject * /* static, unused */, void*, int32_t, const RuntimeMethod*))IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 6)->methodPointer)(NULL /*static, unused*/, (void*)(void*)L_2, (int32_t)L_3, /*hidden argument*/IL2CPP_RGCTX_METHOD_INFO(InitializedTypeInfo(method->klass)->rgctx_data, 6));
		// NullCheck sits inside the loop body, so it is executed on every iteration.
		NullCheck(L_0);
		(L_0)->SetAt(static_cast<il2cpp_array_size_t>(L_1), (int32_t)L_4);
		int32_t L_5 = V_0;
		V_0 = (int32_t)((int32_t)il2cpp_codegen_add((int32_t)L_5, (int32_t)1));
	}
 
IL_001f:
	{
		int32_t L_6 = V_0;
		// The bound is re-read from the m_Length field each time the condition is evaluated.
		int32_t L_7 = IL2CPP_NATIVEARRAY_GET_LENGTH(((NativeArray_1_t3237678471 *)(NativeArray_1_t3237678471 *)__this)->___m_Length_1);
		if ((((int32_t)L_6) < ((int32_t)L_7)))
		{
			goto IL_0008;
		}
	}
	{
		return;
	}
}

CopyTo was a macro calling the real function, like in other cases. This function looks very similar to the CopyFrom overload for managed arrays. It gets the Length of the NativeArray<T> and performs a loop over it. At each iteration, UnsafeUtility.ReadArrayElement is called to get the element of the NativeArray<T>, and the result is then stored into the corresponding element of the managed array. Just like with CopyFrom, null and bounds checks are still being used in the forms of NullCheck and SetAt.

Conclusion: We’ve already seen the CopyTo overload that takes a NativeArray<T>, which is presumably implemented efficiently in the Unity engine via a MemCpy. The managed array overload of CopyTo is just like the managed array overload of CopyFrom. It unfortunately uses null and bounds checks at each iteration of the loop. Consider using the custom version at the end of the article instead.

Dispose

Finally, we have Dispose which is the way to manually free the unmanaged memory used by the NativeArray<T>.

/// <summary>
/// Minimal test case used to inspect the IL2CPP output for disposing a
/// <see cref="NativeArray{T}"/>
/// </summary>
public static class TestClass
{
	/// <summary>
	/// Calls <c>Dispose</c> on <paramref name="nativeArray"/>, which is
	/// received by value
	/// </summary>
	public static void Dispose(NativeArray<int> nativeArray)
	{
		nativeArray.Dispose();
	}
}

Here’s the IL2CPP output:

// IL2CPP output for TestClass.Dispose: performs the guarded method
// initialization, then forwards to NativeArray_1_Dispose_m869946129.
extern "C"  void TestClass_Dispose_m3319844732 (RuntimeObject * __this /* static, unused */, NativeArray_1_t3237678471  ___nativeArray0, const RuntimeMethod* method)
{
	// Function-local flag: once it is set, the initialization call below is skipped.
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_Dispose_m3319844732_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		// The callee receives the address of this by-value parameter, so any field
		// writes it makes through __this land in this function's copy of the struct.
		NativeArray_1_Dispose_m869946129((NativeArray_1_t3237678471 *)(&___nativeArray0), /*hidden argument*/NativeArray_1_Dispose_m869946129_RuntimeMethod_var);
		return;
	}
}

This is basically just a call to Dispose. Notice that this has been devirtualized so there’s no virtual function call to IDisposable.Dispose but instead a direct, non-virtual call to NativeArray<T>.Dispose. Let’s go look at that:

// Casts NativeArray_1_Dispose_m869946129_gshared to the concrete function-pointer
// type shown below and invokes it with the same two arguments.
#define NativeArray_1_Dispose_m869946129(__this, method) ((  void (*) (NativeArray_1_t3237678471 *, const RuntimeMethod*))NativeArray_1_Dispose_m869946129_gshared)(__this, method)
 
// Passes m_Buffer and m_AllocatorLabel to UnsafeUtility_Free, then resets
// m_Buffer to null and m_Length to 0. m_AllocatorLabel is left as it was.
extern "C"  void NativeArray_1_Dispose_m869946129_gshared (NativeArray_1_t3237678471 * __this, const RuntimeMethod* method)
{
	{
		void* L_0 = (void*)__this->get_m_Buffer_0();
		int32_t L_1 = (int32_t)__this->get_m_AllocatorLabel_2();
		// No check of the buffer or the allocator label is made before the free call.
		UnsafeUtility_Free_m3354695133(NULL /*static, unused*/, (void*)(void*)L_0, (int32_t)L_1, /*hidden argument*/NULL);
		// Clear the fields of the struct that __this points to.
		__this->set_m_Buffer_0((void*)(((uintptr_t)0)));
		__this->set_m_Length_1(0);
		return;
	}
}

This mostly just calls into UnsafeUtility.Free, sets the buffer to null, and sets the length to zero. Here’s Free:

// Generated wrapper for the UnsafeUtility::Free internal call: it forwards
// (memory, allocator) to a function pointer that is resolved by name.
extern "C"  void UnsafeUtility_Free_m3354695133 (RuntimeObject * __this /* static, unused */, void* ___memory0, int32_t ___allocator1, const RuntimeMethod* method)
{
	typedef void (*UnsafeUtility_Free_m3354695133_ftn) (void*, int32_t);
	// Function-local static, so the resolved pointer is kept between calls.
	static UnsafeUtility_Free_m3354695133_ftn _il2cpp_icall_func;
	// Resolve by signature string only while the cached pointer is still null.
	if (!_il2cpp_icall_func)
	_il2cpp_icall_func = (UnsafeUtility_Free_m3354695133_ftn)il2cpp_codegen_resolve_icall ("Unity.Collections.LowLevel.Unsafe.UnsafeUtility::Free(System.Void*,Unity.Collections.Allocator)");
	// __this and method are not passed on to the resolved function.
	_il2cpp_icall_func(___memory0, ___allocator1);
}

This is a call into the Unity engine, so we can’t see how this works.

Conclusion: Dispose is implemented in a very straightforward way, mostly internally in the Unity engine.

Other NativeArray Contents

NativeArray<T> has a few other contents as seen in Unity’s open source. These are mostly just calls to the functions described in this article, so it’s easy to understand how they’ll work. For example, there’s a constructor that takes a managed array, but it just calls CopyFrom. To see more about how these are implemented, follow this guide.

Conclusion

NativeArray<T> is generally implemented very well, at least in the parts of it we can see. Critically, the special-case macros for getting Length and reading and writing via the indexer dramatically improve the final assembly code to the point where it’s completely optimal. On the downside, the managed array overloads of CopyFrom and CopyTo are slowed down dramatically by unnecessary null and bounds checks. See the code below for faster versions of them:

CopyFromFast and CopyToFast

The following extension functions provide CopyFromFast and CopyToFast for NativeArray<T>. Simply place this code into a Unity project and these functions will seem to become part of the NativeArray<T> API. That means we’ll be able to call myNativeArray.CopyFromFast(myManagedArray). These functions still perform error checking for null managed arrays and managed arrays that are too short, but they do it only once outside of the loop instead of in every iteration.

using System;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.IL2CPP.CompilerServices;
 
/// <summary>
/// Extension methods to <see cref="NativeArray{T}"/>
/// </summary>
/// <author>
/// Jackson Dunstan, https://jacksondunstan.com/articles/4713
/// </author>
public static class NativeArrayExtensions
{
	/// <summary>
	/// A faster version of <see cref="NativeArray{T}.CopyFrom(T[])"/>. Copies
	/// the first <c>nativeArray.Length</c> elements of the managed array into
	/// the <see cref="NativeArray{T}"/>. The null and length checks are done
	/// once, before the loop, instead of on every iteration.
	/// </summary>
	/// 
	/// <param name="nativeArray">
	/// <see cref="NativeArray{T}"/> to copy to
	/// </param>
	///
	/// <param name="array">
	/// Managed array to copy from. Elements beyond the length of the
	/// <see cref="NativeArray{T}"/> are ignored.
	/// </param>
	///
	/// <typeparam name="T">
	/// Type of elements in the <see cref="NativeArray{T}"/> and managed array
	/// </typeparam>
	///
	/// <exception cref="NullReferenceException">
	/// Thrown if the managed array is null
	/// </exception>
	///
	/// <exception cref="IndexOutOfRangeException">
	/// Thrown if the managed array is shorter than the
	/// <see cref="NativeArray{T}"/>
	/// </exception>
	[Il2CppSetOption(Option.NullChecks, false)]
	[Il2CppSetOption(Option.ArrayBoundsChecks, false)]
	public static unsafe void CopyFromFast<T>(
		this NativeArray<T> nativeArray,
		T[] array)
		where T : struct
	{
		// Validate once up front; the per-element checks are disabled by the
		// attributes above, so the loop below relies on these guards.
		if (array == null)
		{
			throw new NullReferenceException(nameof(array) + " is null");
		}

		int nativeArrayLength = nativeArray.Length;
		if (array.Length < nativeArrayLength)
		{
			throw new IndexOutOfRangeException(
				nameof(array) + " is shorter than " + nameof(nativeArray));
		}

		// The native buffer is written to, so request the writable pointer.
		void* buffer = nativeArray.GetUnsafePtr();
		for (int i = 0; i < nativeArrayLength; ++i)
		{
			UnsafeUtility.WriteArrayElement(buffer, i, array[i]);
		}
	}

	/// <summary>
	/// A faster version of <see cref="NativeArray{T}.CopyTo(T[])"/>. Copies
	/// every element of the <see cref="NativeArray{T}"/> into the first
	/// <c>nativeArray.Length</c> elements of the managed array. The null and
	/// length checks are done once, before the loop, instead of on every
	/// iteration.
	/// </summary>
	/// 
	/// <param name="nativeArray">
	/// <see cref="NativeArray{T}"/> to copy from
	/// </param>
	///
	/// <param name="array">
	/// Managed array to copy to. Elements beyond the length of the
	/// <see cref="NativeArray{T}"/> are left untouched.
	/// </param>
	///
	/// <typeparam name="T">
	/// Type of elements in the <see cref="NativeArray{T}"/> and managed array
	/// </typeparam>
	///
	/// <exception cref="NullReferenceException">
	/// Thrown if the managed array is null
	/// </exception>
	///
	/// <exception cref="IndexOutOfRangeException">
	/// Thrown if the managed array is shorter than the
	/// <see cref="NativeArray{T}"/>
	/// </exception>
	[Il2CppSetOption(Option.NullChecks, false)]
	[Il2CppSetOption(Option.ArrayBoundsChecks, false)]
	public static unsafe void CopyToFast<T>(
		this NativeArray<T> nativeArray,
		T[] array)
		where T : struct
	{
		// Validate once up front; the per-element checks are disabled by the
		// attributes above, so the loop below relies on these guards.
		if (array == null)
		{
			throw new NullReferenceException(nameof(array) + " is null");
		}

		int nativeArrayLength = nativeArray.Length;
		if (array.Length < nativeArrayLength)
		{
			throw new IndexOutOfRangeException(
				nameof(array) + " is shorter than " + nameof(nativeArray));
		}

		// The native buffer is only read here, so ask for the read-only
		// pointer. This avoids requesting write access, which would be
		// refused for a NativeArray that may only be read.
		void* buffer = nativeArray.GetUnsafeReadOnlyPtr();
		for (int i = 0; i < nativeArrayLength; ++i)
		{
			array[i] = UnsafeUtility.ReadArrayElement<T>(buffer, i);
		}
	}
}