The following script shows you steps to
- download the python source code
- compile it
- integrate it into pyenv
- and then use it
on an Apple Silicon Mac.
# prepare dependencies
brew install openssl readline zlib xz
# checkout python
git clone https://github.com/python/cpython.git
# -p keeps the script re-runnable when the directory already exists
mkdir -p "$HOME/.pythons"
cd cpython
git checkout v3.13.3
# build
./configure \
  --prefix="$HOME/.pythons/python-3.13.3-nogil" \
  --disable-gil \
  --enable-optimizations \
  CPPFLAGS="-I$(brew --prefix)/opt/openssl/include" \
  LDFLAGS="-L$(brew --prefix)/opt/openssl/lib"
make -j"$(sysctl -n hw.ncpu)"
make install
# integrate into pyenv
# Only the parent directory may exist beforehand: if ~/.pyenv/versions/3.13.3-nogil
# were already a directory, ln would place the link *inside* it
# (.../3.13.3-nogil/python-3.13.3-nogil) and pyenv would find no bin/ there.
mkdir -p ~/.pyenv/versions
ln -s "$HOME/.pythons/python-3.13.3-nogil" ~/.pyenv/versions/3.13.3-nogil
# verify installation
pyenv versions | grep 3.13.3
pyenv shell 3.13.3-nogil
python3 --version
```
I’ve also tested it with a script that can be run with a configurable number of parallel threads:
import sys
import sysconfig
import math
import time
import threading
import argparse
def compute_factorial(n: int) -> None:
    """Evaluate ``math.factorial(n)`` purely as CPU load; the result is discarded."""
    _ = math.factorial(n)
def multi_threaded_compute(numbers: list[int], thread_count: int) -> None:
    """Run the factorial workload on ``thread_count`` threads and wait for all of them.

    The inputs are dealt out round-robin: thread ``k`` receives elements
    ``k, k + thread_count, k + 2 * thread_count, ...`` of ``numbers``.
    """

    def run_batch(batch: list[int]) -> None:
        # Each thread processes its own slice sequentially.
        for value in batch:
            compute_factorial(value)

    # One thread per round-robin slice of the input.
    workers = [
        threading.Thread(target=run_batch, args=(numbers[offset::thread_count],))
        for offset in range(thread_count)
    ]

    for current in workers:
        current.start()

    # Block until every thread has finished its slice.
    for current in workers:
        current.join()

    print("All factorials computed.")
def check_gil_status() -> None:
    """Print whether the GIL is active or disabled in the running interpreter.

    ``sysconfig.get_config_var("Py_GIL_DISABLED")`` only tells whether the
    *build* supports free threading; such a build can still run with the GIL
    switched back on. When the interpreter offers ``sys._is_gil_enabled()``
    (Python 3.13+), that runtime answer is reported instead. Otherwise the
    build-time flag is used as a fallback.
    """
    runtime_probe = getattr(sys, "_is_gil_enabled", None)
    if runtime_probe is not None:
        # Authoritative: reflects the actual state of this process.
        print("GIL is active" if runtime_probe() else "GIL is disabled")
        return

    # Older interpreters: only the build configuration is available.
    build_flag = sysconfig.get_config_var("Py_GIL_DISABLED")
    if build_flag is None:
        print("GIL status: Unknown or not supported on this build")
    elif build_flag:
        print("GIL is disabled")
    else:
        print("GIL is active")
def parse_args() -> int:
    """Read the positional ``threads`` argument from the command line.

    Exits through ``ArgumentParser.error`` when the value is not a positive
    integer; otherwise returns the requested thread count.
    """
    cli = argparse.ArgumentParser(description="Multithreaded factorial computation")
    cli.add_argument(
        "threads",
        type=int,
        help="Number of threads to use (must be a positive integer)",
    )
    requested = cli.parse_args().threads
    if requested < 1:
        cli.error("Thread count must be a positive integer")
    return requested
def main() -> None:
    """Print interpreter info, run the threaded factorial workload and report its duration."""
    print(f"Python version: {sys.version}")
    check_gil_status()
    thread_count = parse_args()
    numbers = [100_000, 200_000, 300_000, 400_000, 500_000]
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    started = time.perf_counter()
    multi_threaded_compute(numbers, thread_count)
    elapsed_time = time.perf_counter() - started
    print(f"Time taken: {elapsed_time:.2f} seconds")


if __name__ == "__main__":
    main()
Run on locally built python with GIL disabled:
% time python3 test.py $(sysctl -n hw.ncpu)
Python version: 3.13.3 experimental free-threading build (tags/v3.13.3:6280bb54784, Apr 11 2025, 13:01:07) [Clang 17.0.0 (clang-1700.0.13.3)]
GIL is disabled
All factorials computed.
Time taken: 2.47 seconds
python3 test.py $(sysctl -n hw.ncpu) 6.09s user 0.04s system 235% cpu 2.608 total
Run with same python version from brew with GIL enabled:
% time python3 test.py $(sysctl -n hw.ncpu)
Python version: 3.13.3 (main, Apr 8 2025, 13:54:08) [Clang 16.0.0 (clang-1600.0.26.6)]
GIL is active
All factorials computed.
Time taken: 6.26 seconds
python3 test.py $(sysctl -n hw.ncpu) 6.26s user 0.04s system 99% cpu 6.360 total
Disabling the GIL brings the runtime down from 6.26 to 2.47 seconds. That is about 60% less wall-clock time, i.e. the program runs roughly 2.5× faster 🚀
Check the cpu usage! It went up to 235% instead of 99%.
Here is a small comparison that runs the same workload in Kotlin, with a focus on multithreading:
import java.math.BigInteger
import kotlin.system.exitProcess
import kotlin.system.measureTimeMillis
import java.util.concurrent.Executors
/**
 * Returns `n!` as a [BigInteger] by multiplying the factors `2..n` one after another.
 *
 * For `n < 2` (including negative values) the range is empty and the result is
 * [BigInteger.ONE].
 */
fun computeFactorial(n: Int): BigInteger =
    (2..n).fold(BigInteger.ONE) { product, factor ->
        product.multiply(BigInteger.valueOf(factor.toLong()))
    }
/**
 * Computes the factorial of every entry in [numbers] on a fixed pool of
 * [threadCount] threads and returns once all of them have finished.
 *
 * Completion is awaited through the submitted futures rather than by polling
 * the pool, so the call returns as soon as the last task ends. A task that
 * fails is surfaced to the caller (wrapped in an `ExecutionException`) instead
 * of being dropped silently. The pool is shut down even when submission or a
 * task fails.
 */
fun multiThreadedCompute(numbers: List<Int>, threadCount: Int) {
    val executor = Executors.newFixedThreadPool(threadCount)
    try {
        // Explicit Runnable avoids the submit(Runnable)/submit(Callable) overload ambiguity.
        val pending = numbers.map { n -> executor.submit(Runnable { computeFactorial(n) }) }
        // get() blocks until the task is done and rethrows its failure, if any.
        pending.forEach { it.get() }
    } finally {
        executor.shutdown()
    }
    println("All factorials computed.")
}
/**
 * Entry point: expects the thread count as the first argument, runs the
 * factorial benchmark and prints the elapsed time in seconds.
 *
 * Exits with status 1 when the argument is missing or is not a positive integer.
 */
fun main(args: Array<String>) {
    if (args.isEmpty()) {
        println("Usage: kotlin FactorialBenchmark.kt <thread_count>")
        exitProcess(1)
    }

    val threadCount = args[0].toIntOrNull()?.takeIf { it > 0 } ?: run {
        println("Please provide a valid positive integer for thread count.")
        exitProcess(1)
    }

    println("Kotlin version: ${KotlinVersion.CURRENT}")
    println("Running with $threadCount threads")

    val numbers = listOf(100_000, 200_000, 300_000, 400_000, 500_000)
    val elapsedMillis = measureTimeMillis {
        multiThreadedCompute(numbers, threadCount)
    }

    // Locale.ROOT pins the decimal separator to '.', so the output does not
    // turn into e.g. "92,55" on machines with a comma-decimal default locale.
    val seconds = "%.2f".format(java.util.Locale.ROOT, elapsedMillis / 1000.0)
    println("Time taken: $seconds seconds")
}
Result for the Kotlin program running on the JVM:
% time java -jar benchmark.jar $(sysctl -n hw.ncpu)
Kotlin version: 2.1.20
Running with 10 threads
All factorials computed.
Time taken: 92,55 seconds
java -jar benchmark.jar $(sysctl -n hw.ncpu) 199.58s user 1.49s system 217% cpu 1:32.65 total
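So Python is killing Kotlin in this use case, with or without the GIL. Keep in mind, though, that this is not a like-for-like comparison: Python’s `math.factorial` is implemented in C and uses a divide-and-conquer algorithm, whereas the Kotlin and Go versions shown here multiply the running product by one small integer at a time. Much of the gap therefore comes from the algorithm rather than from the language or its threading model.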
For completeness I also tried it with Java 21. It was nearly the same timing. So in this case it didn’t matter if the code was Kotlin or Java.
Interestingly, even the Go variant didn’t outperform Python here:
% time go run main.go $(sysctl -n hw.ncpu)
Go version: go1.24.2
GIL is not applicable in Go (true concurrency with goroutines and OS threads)
All factorials computed.
Time taken: 16.26 seconds
go run main.go $(sysctl -n hw.ncpu) 32.77s user 3.37s system 217% cpu 16.626 total
package main
import (
"fmt"
"math/big"
"os"
"runtime"
"strconv"
"time"
)
// computeFactorial returns n! as a big.Int, multiplying the factors 2..n one
// after another. For n < 2 the loop body never runs and the result is 1.
func computeFactorial(n int) *big.Int {
	product := new(big.Int).SetInt64(1)
	// Reused scratch value so each iteration does not allocate a new big.Int.
	factor := new(big.Int).SetInt64(1)
	for k := int64(2); k <= int64(n); k++ {
		product.Mul(product, factor.SetInt64(k))
	}
	return product
}
// worker drains the jobs channel, computing one factorial per received
// number, and sends a single value on done once jobs has been closed and
// emptied.
func worker(jobs <-chan int, done chan<- bool) {
	for {
		num, open := <-jobs
		if !open {
			break
		}
		computeFactorial(num)
	}
	done <- true
}
// multiThreadedCompute fans the numbers out to threadCount worker goroutines
// through a shared job queue and returns after every worker has reported
// completion.
func multiThreadedCompute(numbers []int, threadCount int) {
	// Buffered to hold every job, so enqueueing never blocks on the workers.
	queue := make(chan int, len(numbers))
	finished := make(chan bool)

	// Launch the fixed set of workers.
	for started := 0; started < threadCount; started++ {
		go worker(queue, finished)
	}

	// Enqueue all jobs, then close the queue so the workers' loops end.
	for idx := range numbers {
		queue <- numbers[idx]
	}
	close(queue)

	// Collect one completion signal per worker.
	for remaining := threadCount; remaining > 0; remaining-- {
		<-finished
	}

	fmt.Println("All factorials computed.")
}
// checkGILStatus prints a fixed notice that Go has no GIL; it exists to keep
// the program's output parallel to the Python benchmark.
func checkGILStatus() {
	const notice = "GIL is not applicable in Go (true concurrency with goroutines and OS threads)"
	fmt.Println(notice)
}
// main prints the Go version and the GIL notice, validates the single
// <thread_count> argument (exiting with status 1 when it is missing or not a
// positive integer), runs the benchmark and prints the elapsed seconds.
func main() {
	fmt.Println("Go version:", runtime.Version())
	checkGILStatus()

	argv := os.Args
	if len(argv) != 2 {
		fmt.Println("Usage: go run main.go <thread_count>")
		os.Exit(1)
	}

	threadCount, parseErr := strconv.Atoi(argv[1])
	if !(parseErr == nil && threadCount > 0) {
		fmt.Println("Invalid thread count: must be a positive integer")
		os.Exit(1)
	}

	numbers := []int{100_000, 200_000, 300_000, 400_000, 500_000}

	begin := time.Now()
	multiThreadedCompute(numbers, threadCount)
	seconds := time.Since(begin).Seconds()

	fmt.Printf("Time taken: %.2f seconds\n", seconds)
}