Install Python 3.13 without GIL on Apple Silicon

The following script shows you the steps to

  • download the python source code
  • compile it
  • integrate it into pyenv
  • and then use it

on an Apple Silicon Mac.

# Build a free-threaded ("no GIL") CPython 3.13.3 from source, install it under
# $HOME/.pythons and expose it to pyenv as version "3.13.3-nogil".

# prepare dependencies
brew install openssl readline zlib xz

# checkout python
git clone https://github.com/python/cpython.git
# -p: do not fail when the directory already exists (e.g. when re-running)
mkdir -p "$HOME/.pythons"
cd cpython
git checkout v3.13.3

# build
./configure \
  --prefix="$HOME/.pythons/python-3.13.3-nogil" \
  --disable-gil \
  --enable-optimizations \
  CPPFLAGS="-I$(brew --prefix)/opt/openssl/include" \
  LDFLAGS="-L$(brew --prefix)/opt/openssl/lib"
make -j"$(sysctl -n hw.ncpu)"
make install

# integrate into pyenv
# Create only the parent directory. If ~/.pyenv/versions/3.13.3-nogil already
# existed as a real directory, `ln -s` would place the link *inside* it
# (…/3.13.3-nogil/python-3.13.3-nogil) and pyenv would find no bin/ there.
mkdir -p ~/.pyenv/versions
ln -s "$HOME/.pythons/python-3.13.3-nogil" ~/.pyenv/versions/3.13.3-nogil

# verify installation
pyenv versions | grep 3.13.3

pyenv shell 3.13.3-nogil
python3 --version
```

I’ve also tested it with a script that can be run with different numbers of parallel threads:

import sys
import sysconfig
import math
import time
import threading
import argparse


def compute_factorial(n: int) -> None:
    """Evaluate ``n!`` purely as CPU load.

    The product is discarded on purpose: this function is a benchmark
    workload, so it always returns ``None``.
    """
    _ = math.factorial(n)


def multi_threaded_compute(numbers: list[int], thread_count: int) -> None:
    """Distribute factorial computations across multiple threads.

    The inputs are dealt out round-robin (``numbers[i::thread_count]``), one
    slice per thread. Each thread runs ``compute_factorial`` on every number
    of its slice. The call blocks until all threads have finished and then
    prints a confirmation line.

    Args:
        numbers: Values whose factorials are computed.
        thread_count: Upper bound on the number of worker threads; must be
            at least 1. Fewer threads are started when there are fewer
            numbers than requested threads.

    Raises:
        ValueError: If ``thread_count`` is smaller than 1. Without this check
            no work would be done, yet success would still be reported.
    """
    if thread_count < 1:
        raise ValueError("thread_count must be a positive integer")

    def worker(sublist: list[int]) -> None:
        for num in sublist:
            compute_factorial(num)

    # Split the workload round-robin into one chunk per requested thread.
    chunks = [numbers[i::thread_count] for i in range(thread_count)]

    # Empty chunks occur when thread_count > len(numbers); starting a thread
    # for them would only add overhead, so they are skipped.
    threads = [
        threading.Thread(target=worker, args=(chunk,))
        for chunk in chunks
        if chunk
    ]

    for thread in threads:
        thread.start()

    # Wait for all threads to finish
    for thread in threads:
        thread.join()

    print("All factorials computed.")


def check_gil_status() -> None:
    """Print whether the GIL is active or disabled in this process.

    ``sys._is_gil_enabled()`` (available from Python 3.13) reports the actual
    runtime state and is preferred: on a free-threaded build the GIL can
    still be switched back on at runtime, which the build-time flag cannot
    show. On interpreters without that function the build configuration
    variable ``Py_GIL_DISABLED`` is used as a fallback.
    """
    is_gil_enabled = getattr(sys, "_is_gil_enabled", None)
    if is_gil_enabled is not None:
        print("GIL is active" if is_gil_enabled() else "GIL is disabled")
        return

    # Fallback: only tells how the interpreter was built, not the live state.
    gil_disabled = sysconfig.get_config_var("Py_GIL_DISABLED")
    if gil_disabled is None:
        print("GIL status: Unknown or not supported on this build")
    elif gil_disabled:
        print("GIL is disabled")
    else:
        print("GIL is active")


def parse_args() -> int:
    """Read the single positional ``threads`` argument from the command line.

    Returns:
        The requested thread count, guaranteed to be greater than zero.

    Exits through ``argparse`` (usage message, status 2) when the argument
    is missing, not an integer, or not positive.
    """
    cli = argparse.ArgumentParser(description="Multithreaded factorial computation")
    cli.add_argument(
        "threads",
        type=int,
        help="Number of threads to use (must be a positive integer)",
    )
    requested = cli.parse_args().threads

    # argparse only guarantees an int; the sign is checked here.
    if requested < 1:
        cli.error("Thread count must be a positive integer")

    return requested


def main() -> None:
    """Run the factorial benchmark and print how long it took.

    Prints the interpreter version and GIL status, reads the thread count
    from the command line, computes the factorials of a fixed list of
    numbers across that many threads and reports the elapsed time.
    """
    print(f"Python version: {sys.version}")
    check_gil_status()

    thread_count = parse_args()
    numbers = [100_000, 200_000, 300_000, 400_000, 500_000]

    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump (e.g. NTP adjustments), which skews benchmarks.
    start_time = time.perf_counter()
    multi_threaded_compute(numbers, thread_count)
    elapsed_time = time.perf_counter() - start_time

    print(f"Time taken: {elapsed_time:.2f} seconds")


if __name__ == "__main__":
    main()

Run on the locally built Python with the GIL disabled:

% time python3 test.py $(sysctl -n hw.ncpu)

Python version: 3.13.3 experimental free-threading build (tags/v3.13.3:6280bb54784, Apr 11 2025, 13:01:07) [Clang 17.0.0 (clang-1700.0.13.3)]
GIL is disabled
All factorials computed.
Time taken: 2.47 seconds
python3 test.py $(sysctl -n hw.ncpu)  6.09s user 0.04s system 235% cpu 2.608 total

Run with the same Python version from Homebrew with the GIL enabled:

% time python3 test.py $(sysctl -n hw.ncpu)

Python version: 3.13.3 (main, Apr  8 2025, 13:54:08) [Clang 16.0.0 (clang-1600.0.26.6)]
GIL is active
All factorials computed.
Time taken: 6.26 seconds
python3 test.py $(sysctl -n hw.ncpu)  6.26s user 0.04s system 99% cpu 6.360 total

Disabling the GIL brings the runtime down from 6.26 to 2.47 seconds. That is about 60% less wall-clock time, or roughly a 2.5× speedup 🚀

Check the CPU usage! It went up to 235% instead of 99%.

Here is a small comparison that runs the same workload in Kotlin, with a focus on multithreading:

import java.math.BigInteger
import kotlin.system.exitProcess
import kotlin.system.measureTimeMillis
import java.util.concurrent.Executors

/**
 * Returns `n!` as a [BigInteger].
 *
 * The factors 2..n are multiplied one after another; for `n < 2` the range
 * is empty and the result is 1.
 */
fun computeFactorial(n: Int): BigInteger =
    (2..n).fold(BigInteger.ONE) { product, factor ->
        product.multiply(BigInteger.valueOf(factor.toLong()))
    }

/**
 * Computes the factorial of every entry of [numbers] on a fixed-size pool of
 * [threadCount] threads and blocks until all of them are done.
 *
 * Each number is submitted as its own task. The returned futures are awaited
 * with `get()`, which blocks without polling and rethrows a task failure
 * (wrapped in an `ExecutionException`) instead of dropping it silently.
 * The pool is shut down in all cases.
 */
fun multiThreadedCompute(numbers: List<Int>, threadCount: Int) {
    val executor = Executors.newFixedThreadPool(threadCount)

    try {
        val pending = numbers.map { n ->
            executor.submit(Runnable { computeFactorial(n) })
        }

        // Blocks until each task has finished; no sleep/poll loop needed.
        pending.forEach { it.get() }
    } finally {
        // Release the worker threads even when submission or a task failed.
        executor.shutdown()
    }

    println("All factorials computed.")
}

/**
 * Entry point of the Kotlin benchmark.
 *
 * Expects the thread count as the first command-line argument, prints the
 * Kotlin version, runs the factorial workload on that many threads and
 * reports the elapsed time in seconds. Exits with status 1 when the argument
 * is missing or not a positive integer.
 */
fun main(args: Array<String>) {
    if (args.isEmpty()) {
        println("Usage: kotlin FactorialBenchmark.kt <thread_count>")
        exitProcess(1)
    }

    val threadCount = args[0].toIntOrNull()
    if (threadCount == null || threadCount <= 0) {
        println("Please provide a valid positive integer for thread count.")
        exitProcess(1)
    }

    println("Kotlin version: ${KotlinVersion.CURRENT}")
    println("Running with $threadCount threads")

    val numbers = listOf(100_000, 200_000, 300_000, 400_000, 500_000)

    val elapsed = measureTimeMillis {
        multiThreadedCompute(numbers, threadCount)
    }

    // Locale.ROOT pins the decimal separator to '.', so the output does not
    // depend on the machine's locale (e.g. "92,55" on a German system).
    println("Time taken: ${"%.2f".format(java.util.Locale.ROOT, elapsed / 1000.0)} seconds")
}

Result for the Kotlin program running on the JVM:

% time java -jar benchmark.jar $(sysctl -n hw.ncpu)
Kotlin version: 2.1.20
Running with 10 threads
All factorials computed.
Time taken: 92,55 seconds
java -jar benchmark.jar $(sysctl -n hw.ncpu)  199.58s user 1.49s system 217% cpu 1:32.65 total

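So Python is beating Kotlin by a wide margin in this use case, with or without the GIL. Keep in mind that this mostly compares algorithms rather than runtimes: Python’s `math.factorial` is implemented in C and uses a divide-and-conquer scheme, while the Kotlin code multiplies the factors one by one in a simple loop.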

For completeness I also tried it with Java 21. It was nearly the same timing. So in this case it didn’t matter if the code was Kotlin or Java.

Interestingly, even the Go variant didn’t outperform Python here:

% time go run main.go $(sysctl -n hw.ncpu)
Go version: go1.24.2
GIL is not applicable in Go (true concurrency with goroutines and OS threads)
All factorials computed.
Time taken: 16.26 seconds
go run main.go $(sysctl -n hw.ncpu)  32.77s user 3.37s system 217% cpu 16.626 total
package main

import (
	"fmt"
	"math/big"
	"os"
	"runtime"
	"strconv"
	"time"
)

func computeFactorial(n int) *big.Int {
	result := big.NewInt(1)
	tmp := big.NewInt(1)
	for i := 2; i <= n; i++ {
		tmp.SetInt64(int64(i))
		result.Mul(result, tmp)
	}
	return result
}

// worker receives numbers from jobs and computes the factorial of each one,
// discarding the result. Once jobs is closed and drained it sends a single
// value on done.
func worker(jobs <-chan int, done chan<- bool) {
	for {
		n, open := <-jobs
		if !open {
			break
		}
		computeFactorial(n)
	}
	done <- true
}

// multiThreadedCompute starts threadCount worker goroutines, hands them every
// entry of numbers through a buffered channel and returns after each worker
// has signalled completion. A confirmation line is printed at the end.
func multiThreadedCompute(numbers []int, threadCount int) {
	// Buffer sized to hold every job, so the sends below never block.
	queue := make(chan int, len(numbers))
	finished := make(chan bool)

	// Launch the fixed set of workers.
	for started := 0; started < threadCount; started++ {
		go worker(queue, finished)
	}

	// Enqueue all jobs, then close the channel so the workers terminate.
	for idx := range numbers {
		queue <- numbers[idx]
	}
	close(queue)

	// Collect one completion signal per worker.
	for remaining := threadCount; remaining > 0; remaining-- {
		<-finished
	}

	fmt.Println("All factorials computed.")
}

// checkGILStatus prints a fixed notice that Go has no GIL. It mirrors the
// GIL status line printed by the Python variant of this benchmark.
func checkGILStatus() {
	const notice = "GIL is not applicable in Go (true concurrency with goroutines and OS threads)"
	fmt.Println(notice)
}

// main prints the Go version and the GIL notice, validates the single
// <thread_count> command-line argument (exit status 1 on misuse), runs the
// factorial workload with that many workers and prints the elapsed time.
func main() {
	fmt.Println("Go version:", runtime.Version())
	checkGILStatus()

	argv := os.Args
	if len(argv) != 2 {
		fmt.Println("Usage: go run main.go <thread_count>")
		os.Exit(1)
	}

	threadCount, convErr := strconv.Atoi(argv[1])
	if !(convErr == nil && threadCount > 0) {
		fmt.Println("Invalid thread count: must be a positive integer")
		os.Exit(1)
	}

	workload := []int{100_000, 200_000, 300_000, 400_000, 500_000}

	began := time.Now()
	multiThreadedCompute(workload, threadCount)
	took := time.Since(began)

	fmt.Printf("Time taken: %.2f seconds\n", took.Seconds())
}