@@ -462,7 +462,11 @@ def _open_bz2(
462
462
except OSError :
463
463
pass # We try without threads.
464
464
465
- return bz2 .open (filename , mode , compresslevel )
465
+ bz2_file = bz2 .open (filename , mode , compresslevel )
466
+ if "r" in mode :
467
+ return bz2_file
468
+ # Buffer writes on bz2.open to mitigate overhead of small writes
469
+ return io .BufferedWriter (bz2_file ) # type: ignore
466
470
467
471
468
472
def _open_xz (
@@ -488,11 +492,10 @@ def _open_xz(
488
492
except OSError :
489
493
pass # We try without threads.
490
494
491
- return lzma .open (
492
- filename ,
493
- mode ,
494
- preset = compresslevel if "r" not in mode else None ,
495
- )
495
+ if "r" in mode :
496
+ return lzma .open (filename , mode )
497
+ # Buffer writes on lzma.open to mitigate overhead of small writes
498
+ return io .BufferedWriter (lzma .open (filename , mode , preset = compresslevel )) # type: ignore
496
499
497
500
498
501
def _open_zst (
@@ -628,6 +631,9 @@ def _open_reproducible_gzip(filename, mode: str, compresslevel: int):
628
631
# is called. This forces it to be closed.
629
632
if closefd :
630
633
gzip_file .myfileobj = fileobj
634
+ if sys .version_info .major == 3 and sys .version_info .minor < 12 and "r" not in mode :
635
+ # From version 3.12 onwards, gzip is properly internally buffered for writing.
636
+ return io .BufferedWriter (gzip_file ) # type: ignore
631
637
return gzip_file
632
638
633
639
@@ -741,7 +747,7 @@ def xopen(
741
747
...
742
748
743
749
744
- def xopen ( # noqa: C901 # The function is complex, but readable.
750
+ def xopen (
745
751
filename : FileOrPath ,
746
752
mode : Literal ["r" , "w" , "a" , "rt" , "rb" , "wt" , "wb" , "at" , "ab" ] = "r" ,
747
753
compresslevel : Optional [int ] = None ,
@@ -824,18 +830,6 @@ def xopen( # noqa: C901 # The function is complex, but readable.
824
830
else :
825
831
opened_file , _ = _file_or_path_to_binary_stream (filename , binary_mode )
826
832
827
- # The "write" method for GzipFile is very costly. Lots of python calls are
828
- # made. To a lesser extent this is true for LzmaFile and BZ2File. By
829
- # putting a buffer in between, the expensive write method is called much
830
- # less. The effect is very noticeable when writing small units such as
831
- # lines or FASTQ records.
832
- if (
833
- isinstance (opened_file , (gzip .GzipFile , bz2 .BZ2File , lzma .LZMAFile )) # FIXME
834
- and "w" in mode
835
- ):
836
- opened_file = io .BufferedWriter (
837
- opened_file , buffer_size = BUFFER_SIZE # type: ignore
838
- )
839
833
if "t" in mode :
840
834
return io .TextIOWrapper (opened_file , encoding , errors , newline )
841
835
return opened_file
0 commit comments