ParallelEnv
- class paddle.distributed.ParallelEnv
Note
This API is not recommended. If you need to get the rank and world size, it is recommended to use paddle.distributed.get_rank() and paddle.distributed.get_world_size() instead.

This class is used to obtain the environment variables required for the parallel execution of paddle.nn.Layer in dynamic mode. The parallel execution in dynamic mode needs to be started using paddle.distributed.launch or paddle.distributed.spawn.
Examples

>>> import paddle
>>> import paddle.distributed as dist

>>> def train():
...     # 1. initialize parallel environment
...     dist.init_parallel_env()
...     # 2. get current ParallelEnv
...     parallel_env = dist.ParallelEnv()
...     print("rank: ", parallel_env.rank)
...     print("world_size: ", parallel_env.world_size)

>>> if __name__ == '__main__':
...     # 1. start by ``paddle.distributed.spawn`` (default)
...     dist.spawn(train, nprocs=2)
...     # 2. start by ``paddle.distributed.launch``
...     train()

# Print result in process 1:
# rank: 0
# world_size: 2
# Print result in process 2:
# rank: 1
# world_size: 2
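As recommended in the note above, here is a minimal sketch of the preferred approach, using paddle.distributed.get_rank() and paddle.distributed.get_world_size() with the same spawn-based startup as the example above:

>>> import paddle.distributed as dist

>>> def train():
...     # initialize the parallel environment, then query rank / world size directly
...     dist.init_parallel_env()
...     print("rank: ", dist.get_rank())
...     print("world_size: ", dist.get_world_size())

>>> if __name__ == '__main__':
...     dist.spawn(train, nprocs=2)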
- property rank
Rank of current trainer.
Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0.

Examples

>>> # execute this command in terminal: export PADDLE_TRAINER_ID=0
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The rank is %d" % env.rank)
The rank is 0
- property world_size
The number of trainers (number of processes participating in the current job).
Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1.

Examples

>>> # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The world_size is %d" % env.world_size)
The world_size is 4
- property device_id
The ID of the selected GPU card for parallel training.
Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0.

Examples

>>> # execute this command in terminal: export FLAGS_selected_gpus=1
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The device id is %d" % env.device_id)
The device id is 1
- property device_type
The type of custom device for parallel training.
Its value is equal to the value of the environment variable PADDLE_XCCL_BACKEND. The default value is None.
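A minimal usage sketch for this property, assuming PADDLE_XCCL_BACKEND is set to a custom device type (the value npu below is only illustrative):

>>> # execute this command in terminal: export PADDLE_XCCL_BACKEND=npu
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The device type is %s" % env.device_type)
The device type is npu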
- property current_endpoint
The endpoint of the current trainer, in the form of node IP and port.
Its value is equal to the value of the environment variable PADDLE_CURRENT_ENDPOINT. The default value is "".

Examples

>>> # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The current endpoint is %s" % env.current_endpoint)
The current endpoint is 127.0.0.1:6170
- property trainer_endpoints
The endpoints of all trainer nodes in the task, which are used to broadcast the NCCL ID when NCCL2 is initialized.
Its value is equal to the value of the environment variable PADDLE_TRAINER_ENDPOINTS. The default value is "".

Examples

>>> # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The trainer endpoints are %s" % env.trainer_endpoints)
The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171']
- property nrings
Number of NCCL communication rings of the current trainer.
Its value is equal to the value of the environment variable FLAGS_nccl_nrings. The default value is 1.

Examples

>>> # execute this command in terminal: export FLAGS_nccl_nrings=1
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The nrings is %d" % env.nrings)
The nrings is 1
- property pg_timeout
Timeout of the process group.
Its value is equal to the value of the environment variable PADDLE_PG_TIMEOUT. The default value is 30 minutes.

Examples

>>> # execute this command in terminal: export PADDLE_PG_TIMEOUT=1800000
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print(env.pg_timeout)  # the pg_timeout of the process group
1800000
- property local_rank
Rank of current trainer.
Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0.

Examples

>>> # execute this command in terminal: export PADDLE_TRAINER_ID=0
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The local rank is %d" % env.local_rank)
The local rank is 0
- property nranks
The number of trainers (number of processes participating in the current job).
Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1.

Examples

>>> # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The nranks is %d" % env.nranks)
The nranks is 4
- property dev_id
The ID of the selected GPU card for parallel training.
Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0.

Examples

>>> # execute this command in terminal: export FLAGS_selected_gpus=1
>>> import paddle.distributed as dist
>>> env = dist.ParallelEnv()
>>> print("The device id is %d" % env.dev_id)
The device id is 1