Source code for mindspore.dataset.utils.line_reader

# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Efficient line-based file reading.
"""
import os

from mindspore import log as logger
from ..core.validator_helpers import check_filename, check_uint64, check_value


class LineReader:
    """
    Line-based file reader.

    Cache the line-based meta data of the file in advance to achieve random-access
    reading of each file line.

    Args:
        filename (str): Filename to be read.

    Raises:
        TypeError: If `filename` is not of type str.
        RuntimeError: If `filename` does not exist or is not a regular file.

    Examples:
        >>> from mindspore.dataset import LineReader
        >>>
        >>> reader = LineReader("/path/to/txt/or/csv/file")
        >>> # Read the first line of csv file
        >>> reader.readline(1)
        >>> # Return the row size in csv file
        >>> reader.len()
        >>> # Close the handle
        >>> reader.close()
    """

    def __init__(self, filename):
        check_filename(filename)
        self.filename = os.path.realpath(filename)
        if not os.path.exists(self.filename):
            raise RuntimeError("The input file [{}] does not exist.".format(filename))
        if not os.path.isfile(self.filename):
            raise RuntimeError("The input file [{}] is not a regular file.".format(filename))

        # The read handle is opened lazily by the first read.
        self.fo_handle = None

        # Record the byte offset at which every line starts. The file is scanned
        # in binary mode, so lines are delimited by b"\n" only and the offsets
        # are exact byte positions.
        self.offsets = []
        position = 0
        with open(self.filename, mode='rb') as fo:
            for raw_line in fo:
                self.offsets.append(position)
                position += len(raw_line)

        if not self.offsets:
            logger.warning("The input file [{}] is empty.".format(filename))

    def __getitem__(self, line):
        """Read specified line content (1-based, same contract as `readline`)."""
        return self.readline(line)

    def __len__(self):
        """Get the total number of lines in the file."""
        return self.len()

    def __enter__(self):
        """Allow usage as a context manager; returns the reader itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file handle when leaving the `with` block."""
        self.close()

    def __del__(self):
        """Close the file when object released."""
        self.close()

    def len(self):
        """Get the total number of lines in the current file."""
        return len(self.offsets)

    def readline(self, line):
        """
        Reads the contents of the specified line.

        Args:
            line (int): The line number to be read, with a starting line number of 1.

        Returns:
            str, the contents of the corresponding line, without line break characters.

        Raises:
            TypeError: If `line` is not of type int.
            ValueError: If `line` exceeds the total number of lines in the file.
        """
        check_uint64(line, "line")
        check_value(line, [1, len(self.offsets)], "line")
        return self._read_at_offset(self.offsets[line - 1])

    def _read_at_offset(self, offset):
        """Return the text of the line starting at byte `offset`, without its line break."""
        if self.fo_handle is None:
            # newline="\n" makes the text layer split lines on "\n" only, exactly
            # like the binary scan that produced the offsets. With the default
            # universal-newline mode a bare "\r" inside a line would end the
            # line early and the returned content would be truncated.
            # NOTE(review): a text-mode handle is positioned with raw byte offsets
            # here; Python only documents offsets returned by tell() as valid for
            # text files - confirm this for stateful / multi-byte-unit encodings.
            self.fo_handle = open(self.filename, mode="rt", newline="\n")
        self.fo_handle.seek(offset)
        content = self.fo_handle.readline()

        # Remove the line break character; "\r\n" must be tried before "\n".
        for terminator in ("\r\n", "\n", "\r"):
            if content.endswith(terminator):
                return content[:-len(terminator)]
        return content

    def close(self):
        """Close the file handle."""
        # __del__ may run on an instance whose __init__ raised before the
        # attribute was assigned, hence the getattr default.
        handle = getattr(self, 'fo_handle', None)
        if handle is None:
            return
        handle.close()
        self.fo_handle = None