Source code for hdfs3.core

# -*- coding: utf-8 -*-
"Main module defining filesystem and file classes"
from __future__ import absolute_import

import ctypes
import logging
import os
import re
import warnings
import posixpath
from collections import deque

from .compatibility import FileNotFoundError, ConnectionError
from .conf import conf
from .utils import (read_block, seek_delimiter, ensure_bytes, ensure_string,
                    ensure_trailing_slash, MyNone)

logger = logging.getLogger(__name__)
_lib = None


class HDFileSystem(object):
    """ Connection to an HDFS namenode

    >>> hdfs = HDFileSystem(host='127.0.0.1', port=8020)  # doctest: +SKIP
    """
    _first_pid = None

    def __init__(self, host=MyNone, port=MyNone, connect=True,
                 autoconf=True, pars=None, **kwargs):
        """
        Parameters
        ----------
        host: str; port: int
            Overrides which take precedence over information in conf files
            and other passed parameters
        connect: bool (True)
            Whether to automatically attempt to establish a connection to
            the name-node.
        autoconf: bool (True)
            Whether to use the configuration found in the conf module as
            the set of defaults
        pars : {str: str}
            any parameters for hadoop, that you can find in hdfs-site.xml,
            https://hadoop.apache.org/docs/r2.6.0/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml
            This dict looks exactly like the one produced by conf - you can,
            for example, remove any problematic entries.
        kwargs: key/value
            Further override parameters. These are applied after the default
            conf and pars; the most typical things to set are:

            host : str (localhost)
                namenode hostname or IP address; in HA mode it is the name
                of the cluster, which can be found in the "fs.defaultFS"
                option.
            port : int (8020)
                namenode RPC port, usually 8020; in HA mode the port must
                be None
            user, ticket_cache, token : str
                kerberos things
        """
        self.conf = conf.copy() if autoconf else {}
        if pars:
            self.conf.update(pars)
        self.conf.update(kwargs)
        if host is not MyNone:
            self.conf['host'] = host
        if port is not MyNone:
            self.conf['port'] = port
        self._handle = None
        if self.conf.get('ticket_cache') and self.conf.get('token'):
            m = "It is not possible to use ticket_cache and token at same time"
            raise RuntimeError(m)
        if connect:
            self.connect()

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['_handle']
        logger.debug("Serialize with state: %s", d)
        return d

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._handle = None
        self.connect()

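    # A minimal connection sketch (not part of the original module); the
    # host names and HA options below are illustrative assumptions, not
    # values shipped with hdfs3:
    #
    #     from hdfs3 import HDFileSystem
    #     # plain single-namenode connection
    #     hdfs = HDFileSystem(host='namenode.example.com', port=8020)
    #     # HA connection: pass the logical cluster name, port=None, and
    #     # the relevant dfs.nameservices / dfs.ha.* options via ``pars``
    #     hdfs_ha = HDFileSystem(host='mycluster', port=None,
    #                            pars={'dfs.nameservices': 'mycluster'})
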
    def connect(self):
        """ Connect to the name node

        This happens automatically at startup
        """
        get_lib()
        conf = self.conf.copy()
        if self._handle:
            return
        if HDFileSystem._first_pid is None:
            HDFileSystem._first_pid = os.getpid()
        elif HDFileSystem._first_pid != os.getpid():
            warnings.warn("Attempting to re-use hdfs3 in child process %d, "
                          "but it was initialized in parent process %d. "
                          "Beware that hdfs3 is not fork-safe and this may "
                          "lead to bugs or crashes."
                          % (os.getpid(), HDFileSystem._first_pid),
                          RuntimeWarning, stacklevel=2)
        o = _lib.hdfsNewBuilder()

        _lib.hdfsBuilderSetNameNode(o, ensure_bytes(conf.pop('host')))

        port = conf.pop('port', None)
        if port is not None:
            _lib.hdfsBuilderSetNameNodePort(o, port)

        user = conf.pop('user', None)
        if user is not None:
            _lib.hdfsBuilderSetUserName(o, ensure_bytes(user))

        ticket_cache = conf.pop('ticket_cache', None)
        if ticket_cache is not None:
            _lib.hdfsBuilderSetKerbTicketCachePath(o,
                                                   ensure_bytes(ticket_cache))

        token = conf.pop('token', None)
        if token is not None:
            _lib.hdfsBuilderSetToken(o, ensure_bytes(token))

        for par, val in conf.items():
            if not _lib.hdfsBuilderConfSetStr(
                    o, ensure_bytes(par), ensure_bytes(val)) == 0:
                warnings.warn('Setting conf parameter %s failed' % par)

        fs = _lib.hdfsBuilderConnect(o)
        _lib.hdfsFreeBuilder(o)
        if fs:
            logger.debug("Connect to handle %d", fs.contents.filesystem)
            self._handle = fs
        else:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise ConnectionError('Connection Failed: {}'.format(msg))

    def delegate_token(self, user=None):
        """Generate delegate auth token.

        Parameters
        ----------
        user: bytes/str
            User to pass to delegation (defaults to user supplied to
            instance); this user is the only one that can renew the token.
        """
        if user is None and self.user is None:
            raise ValueError('Delegation requires a user')
        user = user or self.user
        out = _lib.hdfsGetDelegationToken(self._handle, ensure_bytes(user))
        if out:
            self.token = out
            return out
        else:
            raise RuntimeError('Token delegation failed')

    def renew_token(self, token=None):
        """
        Renew delegation token

        Parameters
        ----------
        token: str or None
            If None, uses the instance's token.  It is an error to do that
            if there is no token.

        Returns
        -------
        New expiration time for the token
        """
        token = token or self.token
        if token is None:
            raise ValueError('There is no token to renew')
        return _lib.hdfsRenewDelegationToken(self._handle,
                                             ensure_bytes(token))

    def cancel_token(self, token=None):
        """ Revoke delegation token

        Parameters
        ----------
        token: str or None
            If None, uses the instance's token.  It is an error to do that
            if there is no token.
        """
        token = token or self.token
        if token is None:
            raise ValueError('There is no token to cancel')
        out = _lib.hdfsCancelDelegationToken(self._handle,
                                             ensure_bytes(token))
        if out:
            raise RuntimeError('Token cancel failed')
        if token == self.token:
            # now our token is invalid - this FS may not work
            self.token = None

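    # A hedged sketch of the delegation-token round trip (assumes a
    # kerberized cluster and that ``hdfs`` was created with a ``user``;
    # the user name is illustrative):
    #
    #     token = hdfs.delegate_token(user='alice')   # issue a token
    #     hdfs.renew_token(token)                     # extend its expiry
    #     hdfs.cancel_token(token)                    # revoke when done
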
    def disconnect(self):
        """ Disconnect from name node """
        if self._handle:
            logger.debug("Disconnect from handle %d",
                         self._handle.contents.filesystem)
            _lib.hdfsDisconnect(self._handle)
        self._handle = None

    def open(self, path, mode='rb', replication=0, buff=0, block_size=0):
        """ Open a file for reading or writing

        Parameters
        ----------
        path: string
            Path of file on HDFS
        mode: string
            One of 'rb', 'wb', or 'ab'
        replication: int
            Replication factor; if zero, use system default (only on write)
        buff: int (=0)
            Client buffer size (bytes); if 0, use default.
        block_size: int
            Size of data-node blocks if writing
        """
        if not self._handle:
            raise IOError("Filesystem not connected")
        if block_size and mode != 'wb':
            raise ValueError('Block size only valid when writing new file')
        if ('a' in mode and self.exists(path) and
                replication != 0 and replication > 1):
            raise IOError("Appending to an existing file with replication > 1"
                          " is unsupported")
        if 'b' not in mode:
            raise NotImplementedError("Text mode not supported, use mode='%s'"
                                      " and manage bytes" % (mode + 'b'))
        return HDFile(self, path, mode, replication=replication, buff=buff,
                      block_size=block_size)

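    # A minimal read/write sketch using ``open`` (paths and sizes are
    # illustrative, not defaults of the library):
    #
    #     with hdfs.open('/tmp/example.bin', 'wb', replication=1,
    #                    block_size=64 * 2**20) as f:
    #         f.write(b'some bytes')
    #     with hdfs.open('/tmp/example.bin', 'rb') as f:
    #         data = f.read()
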
    def du(self, path, total=False, deep=False):
        """Returns file sizes on a path.

        Parameters
        ----------
        path : string
            where to look
        total : bool (False)
            to add up the sizes to a grand total
        deep : bool (False)
            whether to recurse into subdirectories
        """
        fi = self.ls(path, True)
        if deep:
            for apath in fi:
                if apath['kind'] == 'directory':
                    fi.extend(self.ls(apath['name'], True))
        if total:
            return {path: sum(f['size'] for f in fi)}
        return {p['name']: p['size'] for p in fi}

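    # Illustrative return shapes (the paths and sizes below are made up):
    #
    #     hdfs.du('/data')
    #     # {'/data/a.csv': 1024, '/data/sub': 0}
    #     hdfs.du('/data', total=True, deep=True)
    #     # {'/data': 5120}
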
    def df(self):
        """ Used/free disc space on the HDFS system """
        cap = _lib.hdfsGetCapacity(self._handle)
        used = _lib.hdfsGetUsed(self._handle)
        return {'capacity': cap, 'used': used,
                'percent-free': 100 * (cap - used) / cap}

    def get_block_locations(self, path, start=0, length=0):
        """ Fetch physical locations of blocks """
        if not self._handle:
            raise IOError("Filesystem not connected")
        start = int(start) or 0
        length = int(length) or self.info(path)['size']
        nblocks = ctypes.c_int(0)
        out = _lib.hdfsGetFileBlockLocations(self._handle, ensure_bytes(path),
                                             ctypes.c_int64(start),
                                             ctypes.c_int64(length),
                                             ctypes.byref(nblocks))
        locs = []
        for i in range(nblocks.value):
            block = out[i]
            hosts = [block.hosts[i] for i in range(block.numOfNodes)]
            locs.append({'hosts': hosts, 'length': block.length,
                         'offset': block.offset})
        _lib.hdfsFreeFileBlockLocations(out, nblocks)
        return locs

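    # Illustrative output (hostnames, offsets and lengths are made up);
    # useful when scheduling work close to the data:
    #
    #     hdfs.get_block_locations('/data/a.csv')
    #     # [{'hosts': [b'node1', b'node2'], 'offset': 0, 'length': 134217728},
    #     #  {'hosts': [b'node2', b'node3'], 'offset': 134217728, 'length': 1048576}]
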
    def info(self, path):
        """ File information (as a dict) """
        if not self.exists(path):
            raise FileNotFoundError(path)
        fi = _lib.hdfsGetPathInfo(self._handle, ensure_bytes(path)).contents
        out = info_to_dict(fi)
        _lib.hdfsFreeFileInfo(ctypes.byref(fi), 1)
        return ensure_string(out)

    def isdir(self, path):
        """Return True if path refers to an existing directory."""
        try:
            info = self.info(path)
            return info['kind'] == 'directory'
        except EnvironmentError:
            return False

    def isfile(self, path):
        """Return True if path refers to an existing file."""
        try:
            info = self.info(path)
            return info['kind'] == 'file'
        except EnvironmentError:
            return False

    def walk(self, path):
        """Directory tree generator, see ``os.walk``"""
        full_dirs = []
        dirs = []
        files = []
        for info in self.ls(path, True):
            name = info['name']
            tail = posixpath.split(name)[1]
            if info['kind'] == 'directory':
                full_dirs.append(name)
                dirs.append(tail)
            else:
                files.append(tail)
        yield path, dirs, files
        for d in full_dirs:
            for res in self.walk(d):
                yield res

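    # ``walk`` follows the ``os.walk`` convention; a small sketch (the
    # directory name is illustrative):
    #
    #     for root, dirs, files in hdfs.walk('/data'):
    #         for fn in files:
    #             print(posixpath.join(root, fn))
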
    def glob(self, path):
        """ Get list of paths matching glob-like pattern (i.e., with "*"s).

        If passed a directory, gets all contained files; if passed path to
        a file, without any "*", returns one-element list containing that
        filename.  Does not support python3.5's "**" notation.
        """
        path = ensure_string(path)
        try:
            f = self.info(path)
            if f['kind'] == 'directory' and '*' not in path:
                path = ensure_trailing_slash(path) + '*'
            else:
                return [f['name']]
        except IOError:
            pass
        if '/' in path[:path.index('*')]:
            ind = path[:path.index('*')].rindex('/')
            root = path[:ind + 1]
        else:
            root = '/'
        allpaths = []
        for dirname, dirs, fils in self.walk(root):
            allpaths.extend(posixpath.join(dirname, d) for d in dirs)
            allpaths.extend(posixpath.join(dirname, f) for f in fils)
        pattern = re.compile("^" + path.replace('//', '/')
                             .rstrip('/')
                             .replace('*', '[^/]*')
                             .replace('?', '.') + "$")
        return [p for p in allpaths
                if pattern.match(p.replace('//', '/').rstrip('/'))]

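    # Illustrative patterns (the file names are made up):
    #
    #     hdfs.glob('/data/*.csv')        # all CSVs directly under /data
    #     hdfs.glob('/data/2016-??.csv')  # '?' matches a single character
    #     hdfs.glob('/data')              # a directory lists its contents
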
    def ls(self, path, detail=False):
        """ List files at path

        Parameters
        ----------
        path : string/bytes
            location at which to list files
        detail : bool (=False)
            if True, each list item is a dict of file properties;
            otherwise, returns list of filenames
        """
        if not self.exists(path):
            raise FileNotFoundError(path)
        num = ctypes.c_int(0)
        fi = _lib.hdfsListDirectory(self._handle, ensure_bytes(path),
                                    ctypes.byref(num))
        out = [ensure_string(info_to_dict(fi[i])) for i in range(num.value)]
        _lib.hdfsFreeFileInfo(fi, num.value)
        if detail:
            return out
        else:
            return [o['name'] for o in out]

    @property
    def host(self):
        return self.conf.get('host', '')

    @property
    def port(self):
        return self.conf.get('port', '')

    def __repr__(self):
        if self._handle is None:
            state = 'Disconnected'
        else:
            state = 'Connected'
        return 'hdfs://%s:%s, %s' % (self.host, self.port, state)

    def __del__(self):
        if self._handle:
            self.disconnect()

    def mkdir(self, path):
        """ Make directory at path """
        out = _lib.hdfsCreateDirectory(self._handle, ensure_bytes(path))
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('Create directory failed: {}'.format(msg))

    def makedirs(self, path, mode=0o711):
        """ Create directory together with any necessary intermediates """
        out = _lib.hdfsCreateDirectoryEx(self._handle, ensure_bytes(path),
                                         ctypes.c_short(mode), 1)
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('Create directory failed: {}'.format(msg))

    def set_replication(self, path, replication):
        """ Instruct HDFS to set the replication for the given file.

        If successful, the head-node's table is updated immediately, but
        actual copying will be queued for later.  It is acceptable to set
        a replication that cannot be supported (e.g., higher than the
        number of data-nodes).
        """
        if replication < 0:
            raise ValueError('Replication must be positive,'
                             ' or 0 for system default')
        out = _lib.hdfsSetReplication(self._handle, ensure_bytes(path),
                                      ctypes.c_int16(int(replication)))
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('Set replication failed: {}'.format(msg))

    def mv(self, path1, path2):
        """ Move file at path1 to path2 """
        if not self.exists(path1):
            raise FileNotFoundError(path1)
        out = _lib.hdfsRename(self._handle, ensure_bytes(path1),
                              ensure_bytes(path2))
        return out == 0

    def concat(self, destination, paths):
        """Concatenate inputs to destination

        Source files *should* all have the same block size and replication.
        The destination file must be in the same directory as the source
        files.  If the target exists, it will be appended to.

        Some HDFSs impose that the target file must exist and be an exact
        number of blocks long, and that each concatenated file except the
        last is also a whole number of blocks.

        The source files are deleted on successful completion.
        """
        if not self.exists(destination):
            self.touch(destination)
        arr = (ctypes.c_char_p * (len(paths) + 1))()
        arr[:-1] = [ensure_bytes(s) for s in paths]
        arr[-1] = ctypes.c_char_p()  # NULL pointer
        out = _lib.hdfsConcat(self._handle, ensure_bytes(destination), arr)
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('Concat failed on %s %s' % (destination, msg))

    def rm(self, path, recursive=True):
        "Use recursive for `rm -r`, i.e., delete directory and contents"
        if not self.exists(path):
            raise FileNotFoundError(path)
        out = _lib.hdfsDelete(self._handle, ensure_bytes(path),
                              bool(recursive))
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('Remove failed on %s %s' % (path, msg))

    def exists(self, path):
        """ Is there an entry at path? """
        out = _lib.hdfsExists(self._handle, ensure_bytes(path))
        return out == 0

    def chmod(self, path, mode):
        """Change access control of given path

        Exactly what permissions the file will get depends on HDFS
        configurations.

        Parameters
        ----------
        path : string
            file/directory to change
        mode : integer
            As with the POSIX standard, each octal digit refers to
            user-group-all, in that order, with read-write-execute as the
            bits of each group.

        Examples
        --------
        Make read/writeable to all
        >>> hdfs.chmod('/path/to/file', 0o777)  # doctest: +SKIP

        Make read/writeable only to user
        >>> hdfs.chmod('/path/to/file', 0o700)  # doctest: +SKIP

        Make read-only to user
        >>> hdfs.chmod('/path/to/file', 0o400)  # doctest: +SKIP
        """
        if not self.exists(path):
            raise FileNotFoundError(path)
        out = _lib.hdfsChmod(self._handle, ensure_bytes(path),
                             ctypes.c_short(mode))
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError("chmod failed on %s %s" % (path, msg))

    def chown(self, path, owner, group):
        """ Change owner/group """
        if not self.exists(path):
            raise FileNotFoundError(path)
        out = _lib.hdfsChown(self._handle, ensure_bytes(path),
                             ensure_bytes(owner), ensure_bytes(group))
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError("chown failed on %s %s" % (path, msg))

    def cat(self, path):
        """ Return contents of file """
        if not self.exists(path):
            raise FileNotFoundError(path)
        with self.open(path, 'rb') as f:
            result = f.read()
        return result

    def get(self, hdfs_path, local_path, blocksize=2**16):
        """ Copy HDFS file to local """
        # TODO: _lib.hdfsCopy() may do this more efficiently
        if not self.exists(hdfs_path):
            raise FileNotFoundError(hdfs_path)
        with self.open(hdfs_path, 'rb') as f:
            with open(local_path, 'wb') as f2:
                out = 1
                while out:
                    out = f.read(blocksize)
                    f2.write(out)

    def getmerge(self, path, filename, blocksize=2**16):
        """ Concat all files in path (a directory) to local output file """
        files = self.ls(path)
        with open(filename, 'wb') as f2:
            for apath in files:
                with self.open(apath, 'rb') as f:
                    out = 1
                    while out:
                        out = f.read(blocksize)
                        f2.write(out)

    def put(self, filename, path, chunk=2**16, replication=0, block_size=0):
        """ Copy local file to path in HDFS """
        with self.open(path, 'wb', replication=replication,
                       block_size=block_size) as target:
            with open(filename, 'rb') as source:
                while True:
                    out = source.read(chunk)
                    if len(out) == 0:
                        break
                    target.write(out)

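    # A put/get round trip between local disk and HDFS (all paths below
    # are illustrative):
    #
    #     hdfs.put('/tmp/local.csv', '/data/remote.csv')      # local -> HDFS
    #     hdfs.get('/data/remote.csv', '/tmp/copy.csv')       # HDFS -> local
    #     hdfs.getmerge('/data/parts/', '/tmp/merged.csv')    # dir -> one file
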
    def tail(self, path, size=1024):
        """ Return last bytes of file """
        length = self.du(path)[ensure_trailing_slash(path)]
        if size > length:
            return self.cat(path)
        with self.open(path, 'rb') as f:
            f.seek(length - size)
            return f.read(size)

    def head(self, path, size=1024):
        """ Return first bytes of file """
        with self.open(path, 'rb') as f:
            return f.read(size)

    def touch(self, path):
        """ Create zero-length file """
        self.open(path, 'wb').close()

    def read_block(self, fn, offset, length, delimiter=None):
        """ Read a block of bytes from an HDFS file

        Starting at ``offset`` of the file, read ``length`` bytes.  If
        ``delimiter`` is set then we ensure that the read starts and stops
        at delimiter boundaries that follow the locations ``offset`` and
        ``offset + length``.  If ``offset`` is zero then we start at zero.
        The bytestring returned will not include the surrounding delimiter
        strings.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename on HDFS
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> hdfs.read_block('/data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> hdfs.read_block('/data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200'

        See Also
        --------
        hdfs3.utils.read_block
        """
        with self.open(fn, 'rb') as f:
            size = f.info()['size']
            if offset + length > size:
                length = size - offset
            bytes = read_block(f, offset, length, delimiter)
        return bytes

    def list_encryption_zones(self):
        """Get list of all the encryption zones"""
        x = ctypes.c_int(8)
        out = _lib.hdfsListEncryptionZones(self._handle, x)
        if not out:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError("EZ listing failed: %s" % msg)
        res = [struct_to_dict(out[i]) for i in range(x.value)]
        if res:
            _lib.hdfsFreeEncryptionZoneInfo(out, x)
        return res

    def create_encryption_zone(self, path, key_name):
        """Create an encryption zone at path, using the named key"""
        out = _lib.hdfsCreateEncryptionZone(self._handle, ensure_bytes(path),
                                            ensure_bytes(key_name))
        if out != 0:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError("EZ create failed: %s %s" % (path, msg))


def get_lib():
    """ Import C-lib only on demand """
    global _lib
    if _lib is None:
        from .lib import _lib as l
        _lib = l


def struct_to_dict(s):
    """ Return dictionary views of a simple ctypes record-like structure """
    return dict((ensure_string(name), getattr(s, name))
                for (name, p) in s._fields_)


def info_to_dict(s):
    """ Process data returned by hdfsInfo """
    d = struct_to_dict(s)
    d['kind'] = {68: 'directory', 70: 'file'}[d['kind']]
    if d['encryption_info']:
        d['encryption_info'] = struct_to_dict(d['encryption_info'].contents)
    else:
        d['encryption_info'] = None
    return d


mode_numbers = {'w': 1, 'r': 0, 'a': 1025, 'wb': 1, 'rb': 0, 'ab': 1025}


class HDFile(object):
    """ File on HDFS

    Matches the standard Python file interface.

    Examples
    --------
    >>> with hdfs.open('/path/to/hdfs/file.txt') as f:  # doctest: +SKIP
    ...     bytes = f.read(1000)  # doctest: +SKIP
    >>> with hdfs.open('/path/to/hdfs/file.csv') as f:  # doctest: +SKIP
    ...     df = pd.read_csv(f, nrows=1000)  # doctest: +SKIP
    """

    def __init__(self, fs, path, mode, replication=0, buff=0, block_size=0):
        """ Called by open on a HDFileSystem """
        if 't' in mode:
            raise NotImplementedError("Opening a file in text mode is not"
                                      " supported, use ``io.TextIOWrapper``.")
        self.fs = fs
        self.path = path
        self.replication = replication
        self.buff = buff
        self._fs = fs._handle
        self.buffers = []
        self._handle = None
        self.mode = mode
        self.block_size = block_size
        self.lines = deque([])
        self._set_handle()

    def _set_handle(self):
        out = _lib.hdfsOpenFile(self._fs, ensure_bytes(self.path),
                                mode_numbers[self.mode], self.buff,
                                ctypes.c_short(self.replication),
                                ctypes.c_int64(self.block_size))
        if not out:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError("Could not open file: %s, mode: %s %s"
                          % (self.path, self.mode, msg))
        self._handle = out

    def read(self, length=None):
        """ Read bytes from open file """
        if not _lib.hdfsFileIsOpenForRead(self._handle):
            raise IOError('File not read mode')
        buffers = []

        if length is None:
            out = 1
            while out:
                out = self.read(2**16)
                buffers.append(out)
        else:
            while length:
                bufsize = min(2**16, length)
                p = ctypes.create_string_buffer(bufsize)
                ret = _lib.hdfsRead(
                    self._fs, self._handle, p, ctypes.c_int32(bufsize))
                if ret == 0:
                    break
                if ret > 0:
                    if ret < bufsize:
                        buffers.append(p.raw[:ret])
                    elif ret == bufsize:
                        buffers.append(p.raw)
                    length -= ret
                else:
                    raise IOError('Read file %s Failed:' % self.path, -ret)

        return b''.join(buffers)

    def readline(self, chunksize=2**8, lineterminator='\n'):
        """ Return a line using buffered reading.

        A line is a sequence of bytes between ``'\n'`` markers (or given
        line-terminator).

        Line iteration uses this method internally.

        Note: this function requires many calls to HDFS and is slow; it is
        in general better to wrap an HDFile with an ``io.TextIOWrapper``
        for buffering, text decoding and newline support.
        """
        lineterminator = ensure_bytes(lineterminator)
        start = self.tell()
        seek_delimiter(self, lineterminator, chunksize, allow_zero=False)
        end = self.tell()
        self.seek(start)
        return self.read(end - start)

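    # The docstring above recommends ``io.TextIOWrapper`` for buffered
    # text/line reading; a small sketch (the path is illustrative):
    #
    #     import io
    #     with hdfs.open('/data/file.csv', 'rb') as raw:
    #         for line in io.TextIOWrapper(raw, encoding='utf-8'):
    #             print(line)
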
    def _genline(self):
        while True:
            out = self.readline()
            if out:
                yield out
            else:
                # a bare return ends iteration; raising StopIteration in a
                # generator would become RuntimeError on Python 3.7+ (PEP 479)
                return

    def __iter__(self):
        """ Enables `for line in file:` usage """
        return self._genline()

    def __next__(self):
        """ Enables reading a file as a buffer in pandas """
        out = self.readline()
        if out:
            return out
        else:
            raise StopIteration

    # PY2 compatibility
    next = __next__

    def readlines(self):
        """ Return all lines in a file as a list """
        return list(self)

    def tell(self):
        """ Get current byte location in a file """
        out = _lib.hdfsTell(self._fs, self._handle)
        if out == -1:
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('Tell Failed on file %s %s' % (self.path, msg))
        return out

    def seek(self, offset, from_what=0):
        """ Set file read position.  Read mode only.

        Attempt to move out of file bounds raises an exception.  Note that,
        by the convention in python file seek, offset should be <=0 if
        from_what is 2.

        Parameters
        ----------
        offset : int
            byte location in the file.
        from_what : int 0, 1, 2
            if 0 (default), relative to file start; if 1, relative to
            current location; if 2, relative to file end.

        Returns
        -------
        new position
        """
        if from_what not in {0, 1, 2}:
            raise ValueError('seek mode must be 0, 1 or 2')
        info = self.info()
        if from_what == 1:
            offset = offset + self.tell()
        elif from_what == 2:
            offset = info['size'] + offset
        if offset < 0 or offset > info['size']:
            raise ValueError('Attempt to seek outside file')
        out = _lib.hdfsSeek(self._fs, self._handle, ctypes.c_int64(offset))
        if out == -1:  # pragma: no cover
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('Seek Failed on file %s %s' % (self.path, msg))
        return self.tell()

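    # Seek semantics in a nutshell (the file and offsets are illustrative
    # and assume the file is at least 110 bytes long):
    #
    #     with hdfs.open('/data/file.bin', 'rb') as f:
    #         f.seek(100)       # absolute: position 100
    #         f.seek(10, 1)     # relative: now at position 110
    #         f.seek(-50, 2)    # from end: 50 bytes before EOF
    #         tail = f.read(50)
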
    def info(self):
        """ Filesystem metadata about this file """
        return self.fs.info(self.path)

    def write(self, data):
        """ Write bytes to open file (which must be in w or a mode) """
        data = ensure_bytes(data)
        if not data:
            return
        if not _lib.hdfsFileIsOpenForWrite(self._handle):
            msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
            raise IOError('File not write mode: {}'.format(msg))
        write_block = 64 * 2**20
        for offset in range(0, len(data), write_block):
            d = ensure_bytes(data[offset:offset + write_block])
            if not _lib.hdfsWrite(self._fs, self._handle,
                                  d, len(d)) == len(d):
                msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
                raise IOError('Write failed on file %s, %s'
                              % (self.path, msg))
        return len(data)

    def flush(self):
        """ Send buffer to the data-node; actual write may happen later """
        _lib.hdfsFlush(self._fs, self._handle)

    def close(self):
        """ Flush and close file, ensuring the data is readable """
        self.flush()
        _lib.hdfsCloseFile(self._fs, self._handle)
        self._handle = None  # _libhdfs releases memory
        self.mode = 'closed'

    @property
    def read1(self):
        return self.read

    @property
    def closed(self):
        return self.mode == 'closed'

    def writable(self):
        return self.mode.startswith('w') or self.mode.startswith('a')

    def seekable(self):
        return self.readable()

    def readable(self):
        return self.mode.startswith('r')

    def __del__(self):
        self.close()

    def __repr__(self):
        return 'hdfs://%s:%s%s, %s' % (self.fs.host, self.fs.port,
                                       self.path, self.mode)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()