HDFSClient

class paddle.distributed.fleet.utils.HDFSClient(hadoop_home, configs, time_out=300000, sleep_inter=1000)

A tool for operating on HDFS.

Parameters
  • hadoop_home (str) – The Hadoop home directory.

  • configs (dict) – The Hadoop configuration. It is a dictionary that must contain the keys "fs.default.name" and "hadoop.job.ugi".

  • time_out (int) – Timeout for Hadoop commands, in milliseconds. Default is 300000.

  • sleep_inter (int) – Sleep interval between retries, in milliseconds. Default is 1000.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient
>>> hadoop_home = "/home/client/hadoop-client/hadoop/"

>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.ls_dir("hdfs:/test_hdfs_client")
([], [])
list_dirs

list_dirs(fs_path)

Only list directories under fs_path.

Parameters

fs_path (str) – The HDFS file path.

Returns

A list of all its subdirectories, e.g. [subdirname1, subdirname2, …].

Return type

List

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> subdirs = client.list_dirs("hdfs:/test_hdfs_client")
ls_dir

ls_dir(fs_path)

List directories and files under fs_path.

Parameters

fs_path (str) – The HDFS file path.

Returns

Return a 2-tuple: the first element is the list of all its subdirectories, and the second is the list of all its subfiles, e.g. ([subdirname1, subdirname2, …], [filename1, filename2, …]).

Return type

Tuple

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> subdirs, files = client.ls_dir("hdfs:/test_hdfs_client")
is_dir

is_dir(fs_path)

Whether the remote HDFS path is a directory.

Parameters

fs_path (str) – The HDFS file path.

Returns

Return true if the path exists and it’s a directory, otherwise return false.

Return type

Bool

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> ret = client.is_dir("hdfs:/test_hdfs_client")
is_file

is_file(fs_path)

Whether the remote HDFS path is a file.

Parameters

fs_path (str) – The HDFS file path.

Returns

Return true if the path exists and it’s a file, otherwise return false.

Return type

Bool

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> ret = client.is_file("hdfs:/test_hdfs_client")
is_exist

is_exist(fs_path)

Whether the remote HDFS path exists.

Parameters

fs_path (str) – The HDFS file path.

Returns

Whether it is a file or a directory, return true if the path exists, otherwise return false.

Return type

Bool

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> ret = client.is_exist("hdfs:/test_hdfs_client")
upload_dir

upload_dir(local_dir, dest_dir, overwrite=False)

Upload a local directory to HDFS.

Parameters
  • local_dir (str) – The local directory.

  • dest_dir (str) – The destination directory on HDFS.

  • overwrite (bool) – Whether to overwrite the destination if it exists. Default is False.

Returns

The return code.
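
Examples

A minimal usage sketch, following the pattern of the other examples on this page; the local and HDFS directory names are illustrative:

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> # hypothetical local dir "test_hdfs_client" uploaded to "hdfs:/test_hdfs_client"
>>> ret = client.upload_dir("test_hdfs_client", "hdfs:/test_hdfs_client")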

upload

upload(local_path, fs_path, multi_processes=5, overwrite=False)

Upload the local path to remote HDFS.

Parameters
  • local_path (str) – The local path.

  • fs_path (str) – The HDFS path.

  • multi_processes (int) – The number of processes used to upload data concurrently. Default is 5.

  • overwrite (bool) – Whether to overwrite the file on HDFS if it exists. Default is False.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.upload("test_hdfs_client", "hdfs:/test_hdfs_client")
download

download(fs_path, local_path, multi_processes=5, overwrite=False)

Download remote HDFS path to the local.

Parameters
  • fs_path (str) – The HDFS path.

  • local_path (str) – The local path.

  • multi_processes (int) – The number of processes used to download data concurrently. Default is 5.

  • overwrite (bool) – Whether to overwrite the local path if it exists. Default is False.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.download("hdfs:/test_hdfs_client", "./")
mkdirs

mkdirs(fs_path)

Create a remote HDFS directory.

Parameters

fs_path (str) – The HDFS directory path.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.mkdirs("hdfs:/test_hdfs_client")
mv

mv(fs_src_path, fs_dst_path, overwrite=False, test_exists=True)

Move a remote HDFS file or directory from fs_src_path to fs_dst_path.

Parameters
  • fs_src_path (str) – The file or directory to be moved.

  • fs_dst_path (str) – The destination to move to.

  • overwrite (bool) – Whether to overwrite fs_dst_path if it exists. Default is False.

  • test_exists (bool) – Whether to check the existence of fs_src_path and fs_dst_path. When set to true, an exception is thrown if fs_src_path does not exist or fs_dst_path already exists. Default is True.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2")
delete

delete(fs_path)

Delete a remote HDFS path, whether it’s a file or directory.

Parameters

fs_path (str) – The HDFS file path.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.delete("hdfs:/test_hdfs_client")
touch

touch(fs_path, exist_ok=True)

Create a remote HDFS file.

Parameters
  • fs_path (str) – The HDFS file path.

  • exist_ok (bool) – When fs_path exists, if exist_ok is set to false, an exception is thrown. Default is true.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.touch("hdfs:/test_hdfs_client")
cat

cat(fs_path=None)

Cat a remote HDFS file.

Parameters

fs_path (str) – The HDFS file path.

Returns

The file content as a string.

Examples

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> client.cat("hdfs:/test_hdfs_client")
''
list_files_info

list_files_info(path_list)

List the path and size of each file in path_list.

Parameters

path_list (list) – The list of file paths.

Returns

A list of files, each with its path and size.

Return type

List
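
Examples

A minimal usage sketch, following the pattern of the other examples on this page; the file path passed in is illustrative:

>>> from paddle.distributed.fleet.utils import HDFSClient

>>> hadoop_home = "/home/client/hadoop-client/hadoop/"
>>> configs = {
...     "fs.default.name": "hdfs://xxx.hadoop.com:54310",
...     "hadoop.job.ugi": "hello,hello123"
... }

>>> client = HDFSClient(hadoop_home, configs)
>>> # hypothetical file path; each entry in the result carries a path and size
>>> files_info = client.list_files_info(["hdfs:/test_hdfs_client/test_file"])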