UtilBase¶

class paddle.distributed.fleet.UtilBase [source]
all_reduce(input, mode='sum', comm_world='worker')¶

Perform an all_reduce on the input across the specified collection. This is a distributed API.
Parameters

    input (list|numpy.array) – The input variable on which to perform all_reduce across the specified collection.
    mode (str) – "sum", "min" or "max".
    comm_world (str, optional) – Collection used to execute the all_reduce operation. Supported collections include worker, server and all. The default is worker.

Returns

    A numpy array with the same shape as the input.

Return type

    output (numpy.array|None)
Examples

    # Save the following code in `train.py`, and then execute the command
    # `fleetrun --server_num 2 --worker_num 2 train.py`.
    import os

    import numpy as np
    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet import PaddleCloudRoleMaker

    os.environ["PADDLE_WITH_GLOO"] = "2"

    def train():
        role = PaddleCloudRoleMaker(
            is_collective=False,
            init_gloo=True,
            path="./tmp_gloo")
        fleet.init(role)

        if fleet.is_server():
            input = [1, 2]
            output = fleet.util.all_reduce(input, "sum", "server")
            print(output)  # [2, 4]
        elif fleet.is_worker():
            input = np.array([3, 4])
            output = fleet.util.all_reduce(input, "sum", "worker")
            print(output)  # [6, 8]
            output = fleet.util.all_reduce(input, "sum", "all")
            print(output)  # [8, 12]

    if __name__ == "__main__":
        train()
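The three reduction modes combine the contributed values element-wise. The snippet below is a minimal single-process sketch of that semantics using plain NumPy; it does not touch the Fleet communication layer, and the two arrays simply stand in for the hypothetical contributions of two workers:

    import numpy as np

    # Hypothetical contributions from two workers in the "worker" collection.
    contributions = [np.array([3, 4]), np.array([3, 4])]

    # all_reduce combines the contributions element-wise according to `mode`.
    print(np.sum(contributions, axis=0))  # mode="sum" -> [6 8]
    print(np.min(contributions, axis=0))  # mode="min" -> [3 4]
    print(np.max(contributions, axis=0))  # mode="max" -> [3 4]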
barrier(comm_world='worker')¶

Barrier between the specified collection.
Parameters

    comm_world (str, optional) – Collection used to execute the barrier operation. Supported collections include worker, server and all. The default is worker.
Examples

    # Save the following code in `train.py`, and then execute the command
    # `fleetrun --server_num 2 --worker_num 2 train.py`.
    import os

    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet import PaddleCloudRoleMaker

    os.environ["PADDLE_WITH_GLOO"] = "2"

    def train():
        role = PaddleCloudRoleMaker(
            is_collective=False,
            init_gloo=True,
            path="./tmp_gloo")
        fleet.init(role)

        if fleet.is_server():
            fleet.util.barrier("server")
            print("all servers arrive here")
        elif fleet.is_worker():
            fleet.util.barrier("worker")
            print("all workers arrive here")

        # Both servers and workers must reach this barrier.
        fleet.util.barrier("all")
        print("all servers and workers arrive here")

    if __name__ == "__main__":
        train()
all_gather(input, comm_world='worker')¶

Gather the input from every member of the specified collection.
Parameters

    input (int|float) – The input variable to gather across the specified collection.
    comm_world (str, optional) – Collection used to execute the all_gather operation. Supported collections include worker, server and all. The default is worker.

Returns

    A list of the values gathered from the members of the specified collection.

Return type

    output (list)
Examples

    # Save the following code in `train.py`, and then execute the command
    # `fleetrun --server_num 2 --worker_num 2 train.py`.
    import os

    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet import PaddleCloudRoleMaker

    os.environ["PADDLE_WITH_GLOO"] = "2"

    def train():
        role = PaddleCloudRoleMaker(
            is_collective=False,
            init_gloo=True,
            path="./tmp_gloo")
        fleet.init(role)

        if fleet.is_server():
            input = fleet.server_index()
            output = fleet.util.all_gather(input, "server")
            print(output)  # output = [0, 1]
        elif fleet.is_worker():
            input = fleet.worker_index()
            output = fleet.util.all_gather(input, "worker")
            print(output)  # output = [0, 1]
            output = fleet.util.all_gather(input, "all")
            print(output)  # output = [0, 1, 0, 1]

    if __name__ == "__main__":
        train()
get_file_shard(files)¶

Split files before distributed training and return the list of files assigned to the current trainer; a sketch of the split rule implied by the two examples below follows the Examples block.

    Example 1: files is [a, b, c, d, e] and trainer_num is 2, then trainer 0 gets [a, b, c] and trainer 1 gets [d, e].
    Example 2: files is [a, b] and trainer_num is 3, then trainer 0 gets [a], trainer 1 gets [b], and trainer 2 gets [].
Parameters

    files (list) – The list of files to be read and split.

Returns

    The files that belong to this worker.

Return type

    list
Examples

    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet import UserDefinedRoleMaker

    role = UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=fleet.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    files = fleet.util.get_file_shard(["file1", "file2", "file3"])
    print(files)  # files = ["file1", "file2"]
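The split rule implied by examples 1 and 2 above (an even share per trainer, with the remainder going to the lowest-ranked trainers) can be reproduced with the minimal sketch below. It only illustrates the assignment rule; it is not the library's implementation, and shard_files is a hypothetical helper:

    def shard_files(files, trainer_id, trainer_num):
        # Every trainer gets at least len(files) // trainer_num files; the first
        # len(files) % trainer_num trainers receive one extra file each.
        base, remainder = divmod(len(files), trainer_num)
        start = trainer_id * base + min(trainer_id, remainder)
        end = start + base + (1 if trainer_id < remainder else 0)
        return files[start:end]

    print(shard_files(["a", "b", "c", "d", "e"], 0, 2))  # ['a', 'b', 'c']
    print(shard_files(["a", "b", "c", "d", "e"], 1, 2))  # ['d', 'e']
    print(shard_files(["a", "b"], 2, 3))                 # []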
print_on_rank(message, rank_id)¶

Only the worker whose rank equals rank_id prints the given message.
Parameters

    message (str) – The log message to be printed.
    rank_id (int) – The id of the trainer that should print the message.
Examples

    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet import UserDefinedRoleMaker

    role = UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=fleet.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    fleet.util.print_on_rank("I'm worker 0", 0)
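The observable behavior is a guarded print on the calling trainer. The sketch below only illustrates that semantics; it is not the library's implementation, and the comparison against fleet.worker_index() is an assumption about how the rank check could be expressed:

    import paddle.distributed.fleet as fleet

    def print_on_rank_sketch(message, rank_id):
        # Hypothetical helper: emit the message only on the trainer whose
        # worker index matches rank_id; all other trainers stay silent.
        if fleet.worker_index() == rank_id:
            print(message)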