from sklearn.ensemble import RandomForestClassifier

# n_jobs=-1 uses all the cores available on the local machine
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
clf.fit(X, y)
Also have a look at dask-ml, sketched below.
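A minimal sketch of the kind of drop-in, scikit-learn-compatible API dask-ml provides (the grid values are only illustrative; X and y are the same in-memory data as above):

from sklearn.ensemble import RandomForestClassifier
from dask_ml.model_selection import GridSearchCV  # drop-in for sklearn's GridSearchCV

param_grid = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
search = GridSearchCV(RandomForestClassifier(), param_grid)
search.fit(X, y)  # the candidate fits are scheduled by Dask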
from sklearn.ensemble import RandomForestClassifier
import joblib
from dask.distributed import Client
client = Client('scheduler-address')
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
# scatter=[X, y] sends the data to the workers up front, instead of once per task
with joblib.parallel_backend("dask", scatter=[X, y]):
    clf.fit(X, y)
from dask_jobqueue import SLURMCluster
# Each job will use one core and 4GB
cluster = SLURMCluster(cores=1, memory='4GB', **cluster_specific_kwargs)
cluster.scale(jobs=4) # launch 4 jobs
from dask.distributed import Client
client = Client(cluster)
# user code
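As a minimal sketch, the user code here could be the same joblib pattern as above (X and y assumed to already be in memory):

import joblib
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
# the Client created above is the current default, so the "dask" backend picks it up
with joblib.parallel_backend("dask", scatter=[X, y]):
    clf.fit(X, y)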
The tricky bit is cluster_specific_kwargs, i.e. the options that depend on your particular cluster. An SGE example:
from dask_jobqueue import SGECluster

# queue and resource_spec are specific to this particular SGE cluster
resource_spec = 'h_vmem=10G,mem_req=1G'
queue = 'gaia.q'
cluster = SGECluster(queue=queue,
                     cores=1, processes=1,
                     memory='16GB',
                     resource_spec=resource_spec,
                     interface='ib0')  # use the InfiniBand network interface
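From here on the pattern is the same as in the SLURM example above; a minimal sketch:

from dask.distributed import Client

cluster.scale(jobs=4)     # submit 4 SGE jobs, one Dask worker per job
client = Client(cluster)  # connect to the scheduler started by SGECluster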
Going back to the SLURMCluster example, cluster.job_script() shows the script each SLURM job will run:

In [5]: print(cluster.job_script())
#!/usr/bin/env bash
#SBATCH -J dask-worker
#SBATCH -n 1
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G
#SBATCH -t 00:30:00
JOB_ID=${SLURM_JOB_ID%;*}
python -m distributed.cli.dask_worker tcp://192.168.0.11:43725 --nthreads 1 \
    --memory-limit 4.00GB --name dask-worker--${JOB_ID}-- --death-timeout 60
Contribute your favourite one if it is not there already!
i.e. the script where the Dask cluster is created
personal opinion