Skip to content

Commit 76987fe

Browse files
committed
Resourcefixes
1 parent a05a61b commit 76987fe

File tree

1 file changed

+79
-78
lines changed

1 file changed

+79
-78
lines changed

src/xopen/__init__.py

Lines changed: 79 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,6 @@ def __init__( # noqa: C901
171171
compresslevel: Optional[int] = None,
172172
threads: Optional[int] = None,
173173
program_settings: _ProgramSettings = _ProgramSettings(("gzip", "--no-name")),
174-
closefd: bool = True,
175174
):
176175
"""
177176
mode -- one of 'w', 'wb', 'a', 'ab'
@@ -186,21 +185,14 @@ def __init__( # noqa: C901
186185
self._program_args = list(program_settings.program_args)
187186
self._allowed_exit_code = program_settings.allowed_exit_code
188187
self._allowed_exit_message = program_settings.allowed_exit_message
189-
self.closefd = closefd
190188
if mode not in ("r", "rb", "w", "wb", "a", "ab"):
191189
raise ValueError(
192190
f"Mode is '{mode}', but it must be 'r', 'rb', 'w', 'wb', 'a', or 'ab'"
193191
)
194192
if "b" not in mode:
195193
mode += "b"
196-
if hasattr(filename, "read") or hasattr(filename, "write"):
197-
file: BinaryIO = filename # type: ignore
198-
filepath: FilePath = ""
199-
if hasattr(filename, "name"):
200-
filepath = filename.name
201-
else:
202-
file = open(filename, mode) # type: ignore
203-
filepath = filename
194+
self.fileobj, self.closefd = _file_or_path_to_binary_stream(filename, mode)
195+
filepath = filepath_from_path_or_filelike(filename)
204196

205197
if (
206198
compresslevel is not None
@@ -234,7 +226,6 @@ def __init__( # noqa: C901
234226
if sys.platform != "win32":
235227
close_fds = True
236228

237-
self.fileobj = file
238229
self.in_pipe = None
239230
self.in_thread = None
240231
self._feeding = True
@@ -267,7 +258,7 @@ def __init__( # noqa: C901
267258
) # type: ignore
268259
except OSError:
269260
if self.closefd:
270-
file.close()
261+
self.fileobj.close()
271262
raise
272263
assert self.process.stdin is not None
273264
self._file = self.process.stdin # type: ignore
@@ -339,11 +330,11 @@ def close(self) -> None:
339330
if self.in_thread:
340331
self.in_thread.join()
341332
self._file.close()
342-
if self.closefd:
343-
self.fileobj.close()
344333
else:
345334
self._file.close()
346335
self.process.wait()
336+
if self.closefd:
337+
self.fileobj.close()
347338
stderr_message = self._read_error_message()
348339
self._stderr.close()
349340
if not self._error_raised:
@@ -440,30 +431,28 @@ def _open_stdin_or_out(mode: str) -> BinaryIO:
440431
return open(std.fileno(), mode=mode, closefd=False) # type: ignore
441432

442433

443-
def _open_bz2(fileobj: BinaryIO, mode: str, threads: Optional[int], closefd: bool):
434+
def _open_bz2(filename: FileOrPath, mode: str, threads: Optional[int]):
444435
assert "b" in mode
445436
if threads != 0:
446437
try:
447438
# pbzip2 can compress using multiple cores.
448439
return _PipedCompressionProgram(
449-
fileobj,
440+
filename,
450441
mode,
451442
threads=threads,
452443
program_settings=_PROGRAM_SETTINGS["pbzip2"],
453-
closefd=closefd,
454444
)
455445
except OSError:
456446
pass # We try without threads.
457447

458-
return bz2.open(fileobj, mode)
448+
return bz2.open(filename, mode)
459449

460450

461451
def _open_xz(
462-
fileobj: BinaryIO,
452+
filename: FileOrPath,
463453
mode: str,
464454
compresslevel: Optional[int],
465455
threads: Optional[int],
466-
closefd: bool,
467456
):
468457
assert "b" in mode
469458
if compresslevel is None:
@@ -473,24 +462,23 @@ def _open_xz(
473462
try:
474463
# xz can compress using multiple cores.
475464
return _PipedCompressionProgram(
476-
fileobj, mode, compresslevel, threads, _PROGRAM_SETTINGS["xz"], closefd
465+
filename, mode, compresslevel, threads, _PROGRAM_SETTINGS["xz"],
477466
)
478467
except OSError:
479468
pass # We try without threads.
480469

481470
return lzma.open(
482-
fileobj,
471+
filename,
483472
mode,
484473
preset=compresslevel if "w" in mode else None,
485474
)
486475

487476

488477
def _open_zst( # noqa: C901
489-
fileobj: BinaryIO,
478+
filename: FileOrPath,
490479
mode: str,
491480
compresslevel: Optional[int],
492481
threads: Optional[int],
493-
closefd: bool,
494482
):
495483
assert "b" in mode
496484
assert compresslevel != 0
@@ -500,12 +488,11 @@ def _open_zst( # noqa: C901
500488
try:
501489
# zstd can compress using multiple cores
502490
return _PipedCompressionProgram(
503-
fileobj,
491+
filename,
504492
mode,
505493
compresslevel,
506494
threads,
507495
_PROGRAM_SETTINGS["zstd"],
508-
closefd,
509496
)
510497
except OSError:
511498
if zstandard is None:
@@ -518,15 +505,15 @@ def _open_zst( # noqa: C901
518505
cctx = zstandard.ZstdCompressor(level=compresslevel)
519506
else:
520507
cctx = None
521-
f = zstandard.open(fileobj, mode, cctx=cctx)
508+
f = zstandard.open(filename, mode, cctx=cctx)
522509
if mode == "rb":
523510
return io.BufferedReader(f)
524511
elif mode == "wb":
525512
return io.BufferedWriter(f)
526513
return f
527514

528515

529-
def _open_gz(fileobj: BinaryIO, mode: str, compresslevel, threads, closefd):
516+
def _open_gz(filename: FileOrPath, mode: str, compresslevel, threads):
530517
"""
531518
Open a gzip file. The ISA-L library is preferred when applicable because
532519
it is the fastest. Then zlib-ng which is not as fast, but supports all
@@ -547,15 +534,15 @@ def _open_gz(fileobj: BinaryIO, mode: str, compresslevel, threads, closefd):
547534
# and level 3 is slower but does not compress better than level 1 and 2.
548535
if igzip_threaded and (compresslevel in (1, 2) or "r" in mode):
549536
return igzip_threaded.open( # type: ignore
550-
fileobj,
537+
filename,
551538
mode,
552539
compresslevel,
553540
threads=1,
554541
)
555542
if gzip_ng_threaded and zlib_ng:
556543
try:
557544
return gzip_ng_threaded.open(
558-
fileobj,
545+
filename,
559546
mode,
560547
# zlib-ng level 1 is 50% bigger than zlib level 1. Level
561548
# 2 gives a size close to expectations.
@@ -568,28 +555,28 @@ def _open_gz(fileobj: BinaryIO, mode: str, compresslevel, threads, closefd):
568555
for program in ("pigz", "gzip"):
569556
try:
570557
return _PipedCompressionProgram(
571-
fileobj,
558+
filename,
572559
mode,
573560
compresslevel,
574561
threads,
575562
_PROGRAM_SETTINGS[program],
576-
closefd,
577563
)
578564
except OSError:
579565
pass # We try without threads.
580566
return _open_reproducible_gzip(
581-
fileobj, mode=mode, compresslevel=compresslevel, closefd=closefd
567+
filename, mode=mode, compresslevel=compresslevel
582568
)
583569

584570

585-
def _open_reproducible_gzip(fileobj, mode: str, compresslevel: int, closefd):
571+
def _open_reproducible_gzip(filename, mode: str, compresslevel: int):
586572
"""
587573
Open a gzip file for writing (without external processes)
588574
that has neither mtime nor the file name in the header
589575
(equivalent to gzip --no-name)
590576
"""
591577
assert mode in ("rb", "wb", "ab")
592578
assert compresslevel is not None
579+
fileobj, closefd = _file_or_path_to_binary_stream(filename, mode)
593580
# Neither gzip.open nor igzip.open have an mtime option, and they will
594581
# always write the file name, so we need to open the file separately
595582
# and pass it to gzip.GzipFile/igzip.IGzipFile.
@@ -618,35 +605,40 @@ def _open_reproducible_gzip(fileobj, mode: str, compresslevel: int, closefd):
618605
return gzip_file
619606

620607

621-
def _detect_format_from_content(fileobj: BinaryIO) -> Optional[str]:
608+
def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
622609
"""
623610
Attempts to detect file format from the content by reading the first
624611
6 bytes. Returns None if no format could be detected.
625612
"""
626-
if not fileobj.readable():
627-
return None
628-
if hasattr(fileobj, "peek"):
629-
bs = fileobj.peek(6)
630-
elif hasattr(fileobj, "seekable") and fileobj.seekable():
631-
current_pos = fileobj.tell()
632-
bs = fileobj.read(6)
633-
fileobj.seek(current_pos)
634-
else:
613+
fileobj, closefd = _file_or_path_to_binary_stream(filename, "rb")
614+
try:
615+
if not fileobj.readable():
616+
return None
617+
if hasattr(fileobj, "peek"):
618+
bs = fileobj.peek(6)
619+
elif hasattr(fileobj, "seekable") and fileobj.seekable():
620+
current_pos = fileobj.tell()
621+
bs = fileobj.read(6)
622+
fileobj.seek(current_pos)
623+
else:
624+
return None
625+
626+
if bs[:2] == b"\x1f\x8b":
627+
# https://tools.ietf.org/html/rfc1952#page-6
628+
return "gz"
629+
elif bs[:3] == b"\x42\x5a\x68":
630+
# https://en.wikipedia.org/wiki/List_of_file_signatures
631+
return "bz2"
632+
elif bs[:6] == b"\xfd\x37\x7a\x58\x5a\x00":
633+
# https://tukaani.org/xz/xz-file-format.txt
634+
return "xz"
635+
elif bs[:4] == b"\x28\xb5\x2f\xfd":
636+
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
637+
return "zst"
635638
return None
636-
637-
if bs[:2] == b"\x1f\x8b":
638-
# https://tools.ietf.org/html/rfc1952#page-6
639-
return "gz"
640-
elif bs[:3] == b"\x42\x5a\x68":
641-
# https://en.wikipedia.org/wiki/List_of_file_signatures
642-
return "bz2"
643-
elif bs[:6] == b"\xfd\x37\x7a\x58\x5a\x00":
644-
# https://tukaani.org/xz/xz-file-format.txt
645-
return "xz"
646-
elif bs[:4] == b"\x28\xb5\x2f\xfd":
647-
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
648-
return "zst"
649-
return None
639+
finally:
640+
if closefd:
641+
fileobj.close()
650642

651643

652644
def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
@@ -664,32 +656,43 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
664656
return None
665657

666658

667-
def _file_or_path_to_name_and_binary_stream(
659+
def _file_or_path_to_binary_stream(
668660
file_or_path: FileOrPath, binary_mode: str
669-
) -> Tuple[str, BinaryIO, bool]:
661+
) -> Tuple[BinaryIO, bool]:
670662
if binary_mode not in ("rb", "wb", "ab"):
671663
raise AssertionError()
672664
if file_or_path == "-":
673-
return "", _open_stdin_or_out(binary_mode), False
674-
if isinstance(file_or_path, (str, bytes, os.PathLike)):
665+
return _open_stdin_or_out(binary_mode), False
666+
try:
675667
filepath = os.fspath(file_or_path)
676-
if isinstance(filepath, bytes):
677-
filepath = filepath.decode()
678-
return filepath, open(os.fspath(file_or_path), binary_mode), True # type: ignore
668+
except TypeError:
669+
pass
670+
else:
671+
return open(filepath, binary_mode), True # type: ignore
679672
if isinstance(file_or_path, (io.BufferedReader, io.BufferedWriter)):
680-
return file_or_path.name, file_or_path, False
673+
return file_or_path, False
681674
if isinstance(file_or_path, io.TextIOWrapper):
682-
return file_or_path.name, file_or_path.buffer, False
675+
return file_or_path.buffer, False
683676
if isinstance(file_or_path, io.IOBase) and not hasattr(file_or_path, "encoding"):
684677
# Text files have encoding attributes. This file is binary:
685-
return "", file_or_path, False
678+
return file_or_path, False
686679
else:
687680
raise TypeError(
688681
f"Unsupported type for {file_or_path}, "
689682
f"{file_or_path.__class__.__name__}."
690683
)
691684

692685

686+
def filepath_from_path_or_filelike(fileorpath: FileOrPath):
687+
try:
688+
return os.fspath(fileorpath)
689+
except TypeError:
690+
pass
691+
if hasattr(fileorpath, "name"):
692+
return fileorpath.name
693+
return ""
694+
695+
693696
@overload
694697
def xopen(
695698
filename: FileOrPath,
@@ -780,29 +783,27 @@ def xopen( # noqa: C901 # The function is complex, but readable.
780783
if mode not in ("rt", "rb", "wt", "wb", "at", "ab"):
781784
raise ValueError("Mode '{}' not supported".format(mode))
782785
binary_mode = mode[0] + "b"
783-
filepath, fileobj, closefd = _file_or_path_to_name_and_binary_stream(
784-
filename, binary_mode
785-
)
786+
filepath = filepath_from_path_or_filelike(filename)
786787

787788
if format not in (None, "gz", "xz", "bz2", "zst"):
788789
raise ValueError(
789790
f"Format not supported: {format}. "
790791
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
791792
)
792793
detected_format = format or _detect_format_from_extension(filepath)
793-
if detected_format is None and "w" not in mode:
794-
detected_format = _detect_format_from_content(fileobj)
794+
if detected_format is None and "r" in mode:
795+
detected_format = _detect_format_from_content(filename)
795796

796797
if detected_format == "gz":
797-
opened_file = _open_gz(fileobj, binary_mode, compresslevel, threads, closefd)
798+
opened_file = _open_gz(filename, binary_mode, compresslevel, threads)
798799
elif detected_format == "xz":
799-
opened_file = _open_xz(fileobj, binary_mode, compresslevel, threads, closefd)
800+
opened_file = _open_xz(filename, binary_mode, compresslevel, threads)
800801
elif detected_format == "bz2":
801-
opened_file = _open_bz2(fileobj, binary_mode, threads, closefd)
802+
opened_file = _open_bz2(filename, binary_mode, threads)
802803
elif detected_format == "zst":
803-
opened_file = _open_zst(fileobj, binary_mode, compresslevel, threads, closefd)
804+
opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
804805
else:
805-
opened_file = fileobj
806+
opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode)
806807

807808
# The "write" method for GzipFile is very costly. Lots of python calls are
808809
# made. To a lesser extent this is true for LzmaFile and BZ2File. By

0 commit comments

Comments
 (0)