Coverage for biobb_dna/utils/loader.py: 63%

27 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:06 +0000

1#!/usr/bin/env python3 

2 

3"""Utility functions to load files.""" 

4import zipfile 

5from pathlib import Path 

6 

7import pandas as pd 

8 

9 

def read_series(input_serfile, usecols=None):
    """Read a .ser file into a DataFrame.

    The file is parsed as whitespace-separated values without a header
    row, and its first column becomes the index of the result.

    Args:
        input_serfile: path or file-like object accepted by
            ``pandas.read_csv``.
        usecols: optional sequence of integer column positions to keep.
            A ``0`` entry (the index column) is dropped (first occurrence
            only); every remaining position ``i`` selects the column
            labelled ``i + 1``. The sequence is never modified.

    Returns:
        DataFrame with the parsed data, restricted to the requested
        columns when ``usecols`` is given.
    """
    ser_data = pd.read_csv(
        input_serfile,
        header=None,
        sep='\\s+',
        index_col=0)
    if usecols is None:
        return ser_data
    # Work on a copy so the caller's sequence is left untouched and any
    # iterable (tuple, range, ...) is accepted, not only lists.
    selected = list(usecols)
    if 0 in selected:
        # column 0 is already used as the index, it cannot be selected
        selected.remove(0)
    return ser_data[[i + 1 for i in selected]]

22 

23 

def load_data(data_filename, inner_file=None):
    """Read a .csv file directly or from inside a .zip file.

    Args:
        data_filename: path to a ``.csv`` file or to a ``.zip`` archive
            containing one.
        inner_file: name of the member to read when ``data_filename`` is
            a ``.zip`` archive. When omitted, the first member whose name
            ends in ``.csv`` is used.

    Returns:
        DataFrame parsed with the first column as index.

    Raises:
        IOError: if the extension is neither ``.zip`` nor ``.csv``, or if
            no ``.csv`` member is found inside the archive.
    """
    suffix = Path(data_filename).suffix
    if suffix == ".csv":
        return pd.read_csv(data_filename, index_col=0)
    if suffix != ".zip":
        raise IOError("input file extension must be .zip or .csv!")

    # Context managers guarantee the archive and the member handle are
    # closed even if parsing fails.
    with zipfile.ZipFile(data_filename, "r") as zf:
        # use provided inner file name or look for a csv file
        member = inner_file
        if member is None:
            print(
                "inner file name not provided, "
                "using first .csv file found inside .zip.")
            member = next(
                (info for info in zf.infolist()
                 if info.filename.endswith(".csv")),
                None)
            if member is None:
                raise IOError("no .csv file found inside .zip file!")
        with zf.open(member) as dataset:
            return pd.read_csv(dataset, index_col=0)