BuildStrategy

class paddle.static.BuildStrategy

BuildStrategy allows the user to control more precisely how the SSA Graph is built in ParallelExecutor by setting its properties.

Returns

A BuildStrategy object.

Return type

BuildStrategy

Examples

import os
import paddle
import paddle.static as static

paddle.enable_static()

os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()

data = static.data(name="x", shape=[None, 1], dtype="float32")
hidden = static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

build_strategy = static.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
program = static.CompiledProgram(static.default_main_program())
program = program.with_data_parallel(loss_name=loss.name,
                                      build_strategy=build_strategy,
                                      places=places)
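
The compiled program above can then be run with an Executor; below is a minimal continuation of this example (the feed name "x" and the fetched loss.name follow the definitions above):

import numpy

exe = static.Executor(paddle.CPUPlace())
exe.run(static.default_startup_program())

# batch size (10) must be greater than CPU_NUM (2) here
x = numpy.random.random(size=(10, 1)).astype('float32')
loss_data, = exe.run(program,
                     feed={"x": x},
                     fetch_list=[loss.name])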
class GradientScaleStrategy

Members:

CoeffNumDevice

One

Customized

property name

Type

(self: handle) -> str

class ReduceStrategy

Members:

Reduce

AllReduce

property name

Type

(self: handle) -> str

property debug_graphviz_path

debug_graphviz_path indicates the path to which the SSA Graph is written as a graphviz file. It is useful for debugging. Default is the empty string "".

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.debug_graphviz_path = "./graph"
Type

(str, optional)
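
Note that after the graph has been built, the directory will contain .dot files describing it (the exact file names depend on the applied build passes and are illustrative here); they can be rendered with the standard Graphviz tool, e.g. dot -Tpng some_pass.dot -o some_pass.png.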

property enable_auto_fusion

Whether to enable fusing subgraphs into a fusion_group. Currently, only fusing subgraphs composed of elementwise-like operators is supported, such as elementwise_add/mul without broadcast, and activations.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.enable_auto_fusion = True
Type

(bool, optional)

property enable_sequential_execution

If set to True, ops are executed in the same order as they appear in the program. Default is False.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.enable_sequential_execution = True
Type

(bool, optional)

property fuse_bn_act_ops

fuse_bn_act_ops indicates whether to fuse batch_norm and activation_op, which may make execution faster. Default is False.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_act_ops = True
Type

(bool, optional)

property fuse_bn_add_act_ops

fuse_bn_add_act_ops indicates whether to fuse batch_norm, elementwise_add and activation_op, which may make execution faster. Default is True.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_add_act_ops = True
Type

(bool, optional)

property fuse_broadcast_ops

fuse_broadcast_ops indicates whether to fuse the broadcast ops. Note that in Reduce mode, fusing broadcast ops may make the program faster, because fusing them amounts to delaying the execution of all broadcast ops: in that case, all nccl streams are used only for NCCLReduce operations for a period of time. Default is False.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.fuse_broadcast_ops = True
Type

(bool, optional)
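
As noted above, this option takes effect in Reduce mode, so the two properties are typically set together; a minimal sketch:

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
# fuse_broadcast_ops matters in Reduce mode: the fused broadcasts are
# delayed, leaving the nccl streams to NCCLReduce for a period of time.
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
build_strategy.fuse_broadcast_ops = True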

property fuse_elewise_add_act_ops

fuse_elewise_add_act_ops indicates whether to fuse elementwise_add_op and activation_op, which may make execution faster. Default is False.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
Type

(bool, optional)

property fuse_relu_depthwise_conv

fuse_relu_depthwise_conv indicates whether to fuse relu and depthwise_conv2d. It saves GPU memory and may make execution faster. This option is only available on GPU devices. Default is False.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.fuse_relu_depthwise_conv = True
Type

(bool, optional)

property gradient_scale_strategy

There are three ways of defining \(loss@grad\) in ParallelExecutor: CoeffNumDevice, One and Customized. By default, ParallelExecutor sets \(loss@grad\) according to the number of devices; One sets it to 1.0; if you want to customize \(loss@grad\) and feed it yourself, choose Customized. Default is 'CoeffNumDevice'.

Examples

import numpy
import os
import paddle
import paddle.static as static

paddle.enable_static()

use_cuda = True
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
exe = static.Executor(place)

# NOTE: If you use CPU to run the program, you need
# to specify the CPU_NUM, otherwise, paddle will use
# all the number of the logic core as the CPU_NUM,
# in that case, the batch size of the input should be
# greater than CPU_NUM, if not, the process will be
# failed by an exception.
if not use_cuda:
    os.environ['CPU_NUM'] = str(2)
    places = static.cpu_places()
else:
    places = static.cuda_places()

data = static.data(name='X', shape=[None, 1], dtype='float32')
hidden = static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe.run(static.default_startup_program())

build_strategy = static.BuildStrategy()
build_strategy.gradient_scale_strategy = \
          static.BuildStrategy.GradientScaleStrategy.Customized
compiled_prog = static.CompiledProgram(
          static.default_main_program()).with_data_parallel(
                  loss_name=loss.name, build_strategy=build_strategy,
                  places=places)

dev_count = len(places)
x = numpy.random.random(size=(10, 1)).astype('float32')
loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
loss_grad_name = loss.name+"@GRAD"
loss_data = exe.run(compiled_prog,
                      feed={"X": x, loss_grad_name : loss_grad},
                      fetch_list=[loss.name, loss_grad_name])
Type

(paddle.static.BuildStrategy.GradientScaleStrategy, optional)

property memory_optimize

memory_optimize aims to reduce total memory consumption; set it to True to enable it.

Default is None, which means the framework chooses automatically whether to use this strategy: currently, it is enabled when GC is disabled and disabled when GC is enabled. True means enabled and False means disabled.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.memory_optimize = True
Type

(bool, optional)
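
Since the default None delegates the decision to the framework, the value can be set explicitly when a deterministic choice is needed; a minimal sketch:

build_strategy = static.BuildStrategy()
# None (default): framework decides based on whether GC is enabled;
# True: force-enable memory optimization; False: force-disable it.
build_strategy.memory_optimize = False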

property reduce_strategy

There are two reduce strategies in ParallelExecutor: AllReduce and Reduce. If you want each parameter's optimization to be done on all devices independently, choose AllReduce; if you choose Reduce, the parameters' optimization is evenly distributed across the devices, and each optimized parameter is then broadcast to the other devices. Default is 'AllReduce'.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
Type

(paddle.static.BuildStrategy.ReduceStrategy, optional)

property remove_unnecessary_lock

If set to True, some locks in GPU ops are released and ParallelExecutor runs faster. Default is True.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.remove_unnecessary_lock = True
Type

(bool, optional)

property sync_batch_norm

sync_batch_norm indicates whether to use synchronous batch normalization, which synchronizes the mean and variance across multiple devices in the training phase. The current implementation does not support FP16 training or CPU devices, and synchronization only happens within one machine, not across machines. Default is False.

Examples

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.sync_batch_norm = True
Type

(bool, optional)
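
sync_batch_norm only has an effect when the network actually contains batch_norm ops and the program is run data-parallel on multiple GPUs; a minimal sketch under those assumptions (the network here is illustrative):

import paddle
import paddle.static as static

paddle.enable_static()

data = static.data(name='x', shape=[None, 3, 32, 32], dtype='float32')
conv = static.nn.conv2d(input=data, num_filters=8, filter_size=3)
bn = static.nn.batch_norm(input=conv)  # mean/variance synchronized below
loss = paddle.mean(bn)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

build_strategy = static.BuildStrategy()
build_strategy.sync_batch_norm = True  # GPU only, not FP16, single machine
program = static.CompiledProgram(
    static.default_main_program()).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy,
        places=static.cuda_places())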