@@ -171,7 +171,6 @@ def __init__( # noqa: C901
171
171
compresslevel : Optional [int ] = None ,
172
172
threads : Optional [int ] = None ,
173
173
program_settings : _ProgramSettings = _ProgramSettings (("gzip" , "--no-name" )),
174
- closefd : bool = True ,
175
174
):
176
175
"""
177
176
mode -- one of 'w', 'wb', 'a', 'ab'
@@ -186,21 +185,14 @@ def __init__( # noqa: C901
186
185
self ._program_args = list (program_settings .program_args )
187
186
self ._allowed_exit_code = program_settings .allowed_exit_code
188
187
self ._allowed_exit_message = program_settings .allowed_exit_message
189
- self .closefd = closefd
190
188
if mode not in ("r" , "rb" , "w" , "wb" , "a" , "ab" ):
191
189
raise ValueError (
192
190
f"Mode is '{ mode } ', but it must be 'r', 'rb', 'w', 'wb', 'a', or 'ab'"
193
191
)
194
192
if "b" not in mode :
195
193
mode += "b"
196
- if hasattr (filename , "read" ) or hasattr (filename , "write" ):
197
- file : BinaryIO = filename # type: ignore
198
- filepath : FilePath = ""
199
- if hasattr (filename , "name" ):
200
- filepath = filename .name
201
- else :
202
- file = open (filename , mode ) # type: ignore
203
- filepath = filename
194
+ self .fileobj , self .closefd = _file_or_path_to_binary_stream (filename , mode )
195
+ filepath = filepath_from_path_or_filelike (filename )
204
196
205
197
if (
206
198
compresslevel is not None
@@ -234,7 +226,6 @@ def __init__( # noqa: C901
234
226
if sys .platform != "win32" :
235
227
close_fds = True
236
228
237
- self .fileobj = file
238
229
self .in_pipe = None
239
230
self .in_thread = None
240
231
self ._feeding = True
@@ -267,7 +258,7 @@ def __init__( # noqa: C901
267
258
) # type: ignore
268
259
except OSError :
269
260
if self .closefd :
270
- file .close ()
261
+ self . fileobj .close ()
271
262
raise
272
263
assert self .process .stdin is not None
273
264
self ._file = self .process .stdin # type: ignore
@@ -339,11 +330,11 @@ def close(self) -> None:
339
330
if self .in_thread :
340
331
self .in_thread .join ()
341
332
self ._file .close ()
342
- if self .closefd :
343
- self .fileobj .close ()
344
333
else :
345
334
self ._file .close ()
346
335
self .process .wait ()
336
+ if self .closefd :
337
+ self .fileobj .close ()
347
338
stderr_message = self ._read_error_message ()
348
339
self ._stderr .close ()
349
340
if not self ._error_raised :
@@ -440,30 +431,28 @@ def _open_stdin_or_out(mode: str) -> BinaryIO:
440
431
return open (std .fileno (), mode = mode , closefd = False ) # type: ignore
441
432
442
433
443
def _open_bz2(filename: FileOrPath, mode: str, threads: Optional[int]):
    """
    Open a bzip2 file in binary mode.

    Unless threads is 0, an external pbzip2 process is tried first. If that
    process cannot be started (OSError), Python's bz2 module is used instead.

    filename -- a path, a file-like object, or "-" for stdin/stdout
    mode -- a binary mode; it must contain "b"
    threads -- passed on to the pbzip2 process; 0 skips pbzip2 entirely
    """
    assert "b" in mode
    if threads != 0:
        try:
            # pbzip2 can compress using multiple cores.
            return _PipedCompressionProgram(
                filename,
                mode,
                threads=threads,
                program_settings=_PROGRAM_SETTINGS["pbzip2"],
            )
        except OSError:
            pass  # We try without threads.

    if filename == "-":
        # bz2.open would treat "-" as the name of a regular file. The piped
        # code path maps it to stdin/stdout, so the fallback must do the same.
        filename = _open_stdin_or_out(mode)
    return bz2.open(filename, mode)
459
449
460
450
461
451
def _open_xz (
462
- fileobj : BinaryIO ,
452
+ filename : FileOrPath ,
463
453
mode : str ,
464
454
compresslevel : Optional [int ],
465
455
threads : Optional [int ],
466
- closefd : bool ,
467
456
):
468
457
assert "b" in mode
469
458
if compresslevel is None :
@@ -473,24 +462,23 @@ def _open_xz(
473
462
try :
474
463
# xz can compress using multiple cores.
475
464
return _PipedCompressionProgram (
476
- fileobj , mode , compresslevel , threads , _PROGRAM_SETTINGS ["xz" ], closefd
465
+ filename , mode , compresslevel , threads , _PROGRAM_SETTINGS ["xz" ],
477
466
)
478
467
except OSError :
479
468
pass # We try without threads.
480
469
481
470
return lzma .open (
482
- fileobj ,
471
+ filename ,
483
472
mode ,
484
473
preset = compresslevel if "w" in mode else None ,
485
474
)
486
475
487
476
488
477
def _open_zst ( # noqa: C901
489
- fileobj : BinaryIO ,
478
+ filename : FileOrPath ,
490
479
mode : str ,
491
480
compresslevel : Optional [int ],
492
481
threads : Optional [int ],
493
- closefd : bool ,
494
482
):
495
483
assert "b" in mode
496
484
assert compresslevel != 0
@@ -500,12 +488,11 @@ def _open_zst( # noqa: C901
500
488
try :
501
489
# zstd can compress using multiple cores
502
490
return _PipedCompressionProgram (
503
- fileobj ,
491
+ filename ,
504
492
mode ,
505
493
compresslevel ,
506
494
threads ,
507
495
_PROGRAM_SETTINGS ["zstd" ],
508
- closefd ,
509
496
)
510
497
except OSError :
511
498
if zstandard is None :
@@ -518,15 +505,15 @@ def _open_zst( # noqa: C901
518
505
cctx = zstandard .ZstdCompressor (level = compresslevel )
519
506
else :
520
507
cctx = None
521
- f = zstandard .open (fileobj , mode , cctx = cctx )
508
+ f = zstandard .open (filename , mode , cctx = cctx )
522
509
if mode == "rb" :
523
510
return io .BufferedReader (f )
524
511
elif mode == "wb" :
525
512
return io .BufferedWriter (f )
526
513
return f
527
514
528
515
529
- def _open_gz (fileobj : BinaryIO , mode : str , compresslevel , threads , closefd ):
516
+ def _open_gz (filename : FileOrPath , mode : str , compresslevel , threads ):
530
517
"""
531
518
Open a gzip file. The ISA-L library is preferred when applicable because
532
519
it is the fastest. Then zlib-ng which is not as fast, but supports all
@@ -547,15 +534,15 @@ def _open_gz(fileobj: BinaryIO, mode: str, compresslevel, threads, closefd):
547
534
# and level 3 is slower but does not compress better than level 1 and 2.
548
535
if igzip_threaded and (compresslevel in (1 , 2 ) or "r" in mode ):
549
536
return igzip_threaded .open ( # type: ignore
550
- fileobj ,
537
+ filename ,
551
538
mode ,
552
539
compresslevel ,
553
540
threads = 1 ,
554
541
)
555
542
if gzip_ng_threaded and zlib_ng :
556
543
try :
557
544
return gzip_ng_threaded .open (
558
- fileobj ,
545
+ filename ,
559
546
mode ,
560
547
# zlib-ng level 1 is 50% bigger than zlib level 1. Level
561
548
# 2 gives a size close to expectations.
@@ -568,28 +555,28 @@ def _open_gz(fileobj: BinaryIO, mode: str, compresslevel, threads, closefd):
568
555
for program in ("pigz" , "gzip" ):
569
556
try :
570
557
return _PipedCompressionProgram (
571
- fileobj ,
558
+ filename ,
572
559
mode ,
573
560
compresslevel ,
574
561
threads ,
575
562
_PROGRAM_SETTINGS [program ],
576
- closefd ,
577
563
)
578
564
except OSError :
579
565
pass # We try without threads.
580
566
return _open_reproducible_gzip (
581
- fileobj , mode = mode , compresslevel = compresslevel , closefd = closefd
567
+ filename , mode = mode , compresslevel = compresslevel
582
568
)
583
569
584
570
585
- def _open_reproducible_gzip (fileobj , mode : str , compresslevel : int , closefd ):
571
+ def _open_reproducible_gzip (filename , mode : str , compresslevel : int ):
586
572
"""
587
573
Open a gzip file for writing (without external processes)
588
574
that has neither mtime nor the file name in the header
589
575
(equivalent to gzip --no-name)
590
576
"""
591
577
assert mode in ("rb" , "wb" , "ab" )
592
578
assert compresslevel is not None
579
+ fileobj , closefd = _file_or_path_to_binary_stream (filename , mode )
593
580
# Neither gzip.open nor igzip.open have an mtime option, and they will
594
581
# always write the file name, so we need to open the file separately
595
582
# and pass it to gzip.GzipFile/igzip.IGzipFile.
@@ -618,35 +605,40 @@ def _open_reproducible_gzip(fileobj, mode: str, compresslevel: int, closefd):
618
605
return gzip_file
619
606
620
607
621
def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
    """
    Attempts to detect file format from the content by inspecting up to the
    first 6 bytes without consuming them. Returns None if no format could be
    detected or if the stream can neither be peeked at nor rewound.

    filename -- a path or file-like object; a stream that is opened here
                (because a path was given) is closed again before returning.
    """
    # Magic numbers, see:
    #   gz:  https://tools.ietf.org/html/rfc1952#page-6
    #   bz2: https://en.wikipedia.org/wiki/List_of_file_signatures
    #   xz:  https://tukaani.org/xz/xz-file-format.txt
    #   zst: https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
    signatures = (
        (b"\x1f\x8b", "gz"),
        (b"\x42\x5a\x68", "bz2"),
        (b"\xfd\x37\x7a\x58\x5a\x00", "xz"),
        (b"\x28\xb5\x2f\xfd", "zst"),
    )
    fileobj, closefd = _file_or_path_to_binary_stream(filename, "rb")
    try:
        if not fileobj.readable():
            return None
        can_peek = hasattr(fileobj, "peek")
        bs = fileobj.peek(6) if can_peek else b""
        if len(bs) < 6 and hasattr(fileobj, "seekable") and fileobj.seekable():
            # peek() may legitimately return fewer bytes than requested (for
            # example when a buffered reader has only a few bytes left in its
            # buffer), so read and rewind instead when the stream allows it.
            current_pos = fileobj.tell()
            bs = fileobj.read(6)
            fileobj.seek(current_pos)
        elif not can_peek:
            # Reading would consume data that cannot be put back.
            return None

        for magic, detected in signatures:
            if bs[: len(magic)] == magic:
                return detected
        return None
    finally:
        if closefd:
            fileobj.close()
650
642
651
643
652
644
def _detect_format_from_extension (filename : Union [str , bytes ]) -> Optional [str ]:
@@ -664,32 +656,43 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
664
656
return None
665
657
666
658
667
def _file_or_path_to_binary_stream(
    file_or_path: FileOrPath, binary_mode: str
) -> Tuple[BinaryIO, bool]:
    """
    Turn a path or file-like object into a binary stream.

    Returns a (stream, closefd) tuple. closefd is True only when the stream
    was opened here from a path, in which case the caller is responsible for
    closing it. Streams handed in by the caller (and the stdin/stdout stream
    used for "-") are returned with closefd set to False.

    file_or_path -- "-" for stdin/stdout, a path (str, bytes, os.PathLike),
                    or an already opened binary or text file object
    binary_mode -- one of 'rb', 'wb', 'ab'

    Raises AssertionError for any other mode and TypeError for objects that
    are neither a path nor a supported file object.
    """
    if binary_mode not in ("rb", "wb", "ab"):
        raise AssertionError(
            f"binary_mode is '{binary_mode}', but it must be 'rb', 'wb' or 'ab'"
        )
    if file_or_path == "-":
        return _open_stdin_or_out(binary_mode), False
    try:
        filepath = os.fspath(file_or_path)
    except TypeError:
        pass  # Not a path; treat it as a file object below.
    else:
        return open(filepath, binary_mode), True  # type: ignore
    if isinstance(file_or_path, (io.BufferedReader, io.BufferedWriter)):
        return file_or_path, False
    if isinstance(file_or_path, io.TextIOWrapper):
        # Use the binary stream underneath the text layer.
        return file_or_path.buffer, False
    if isinstance(file_or_path, io.IOBase) and not hasattr(file_or_path, "encoding"):
        # Text files have encoding attributes. This file is binary:
        return file_or_path, False
    raise TypeError(
        f"Unsupported type for {file_or_path}, "
        f"{file_or_path.__class__.__name__}."
    )
691
684
692
685
686
def filepath_from_path_or_filelike(fileorpath: FileOrPath) -> Union[str, bytes]:
    """
    Return the file path that belongs to a path or file-like object.

    Path-like input (str, bytes, os.PathLike) is returned as produced by
    os.fspath. For any other object its "name" attribute is used, provided
    that it is itself path-like. Returns "" when no path can be determined.
    """
    try:
        return os.fspath(fileorpath)
    except TypeError:
        pass  # Not a path; look for a usable name on the object instead.
    try:
        # File objects created from a file descriptor report the integer
        # descriptor as their name. That is not a path, so it is rejected
        # here rather than handed to code that expects str or bytes.
        return os.fspath(getattr(fileorpath, "name", ""))
    except TypeError:
        return ""
694
+
695
+
693
696
@overload
694
697
def xopen (
695
698
filename : FileOrPath ,
@@ -780,29 +783,27 @@ def xopen( # noqa: C901 # The function is complex, but readable.
780
783
if mode not in ("rt" , "rb" , "wt" , "wb" , "at" , "ab" ):
781
784
raise ValueError ("Mode '{}' not supported" .format (mode ))
782
785
binary_mode = mode [0 ] + "b"
783
- filepath , fileobj , closefd = _file_or_path_to_name_and_binary_stream (
784
- filename , binary_mode
785
- )
786
+ filepath = filepath_from_path_or_filelike (filename )
786
787
787
788
if format not in (None , "gz" , "xz" , "bz2" , "zst" ):
788
789
raise ValueError (
789
790
f"Format not supported: { format } . "
790
791
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
791
792
)
792
793
detected_format = format or _detect_format_from_extension (filepath )
793
- if detected_format is None and "w" not in mode :
794
- detected_format = _detect_format_from_content (fileobj )
794
+ if detected_format is None and "r" in mode :
795
+ detected_format = _detect_format_from_content (filename )
795
796
796
797
if detected_format == "gz" :
797
- opened_file = _open_gz (fileobj , binary_mode , compresslevel , threads , closefd )
798
+ opened_file = _open_gz (filename , binary_mode , compresslevel , threads )
798
799
elif detected_format == "xz" :
799
- opened_file = _open_xz (fileobj , binary_mode , compresslevel , threads , closefd )
800
+ opened_file = _open_xz (filename , binary_mode , compresslevel , threads )
800
801
elif detected_format == "bz2" :
801
- opened_file = _open_bz2 (fileobj , binary_mode , threads , closefd )
802
+ opened_file = _open_bz2 (filename , binary_mode , threads )
802
803
elif detected_format == "zst" :
803
- opened_file = _open_zst (fileobj , binary_mode , compresslevel , threads , closefd )
804
+ opened_file = _open_zst (filename , binary_mode , compresslevel , threads )
804
805
else :
805
- opened_file = fileobj
806
+ opened_file , _ = _file_or_path_to_binary_stream ( filename , binary_mode )
806
807
807
808
# The "write" method for GzipFile is very costly. Lots of python calls are
808
809
# made. To a lesser extent this is true for LzmaFile and BZ2File. By
0 commit comments