4.1. Filter for Specific Rows Across a Set of Worksheets BASE PYTHON

In this case, we want to filter for rows from the first and second worksheets where the sale amount is greater than $1,900.00. To select this subset of rows from the first and second worksheets with base Python, type the following code into a text editor and save the file as 11excel_value_meets_condition_set_of_worksheets.py:

>>> #!/usr/bin/env python3
>>> import sys
>>> from datetime import date
>>> from xlrd import open_workbook, xldate_as_tuple
>>> from xlwt import Workbook
>>> input_file = sys.argv[1]
>>> output_file = sys.argv[2]
>>> output_workbook = Workbook()
>>> output_worksheet =output_workbook.add_sheet('set_of_worksheets')
>>> my_sheets = [0,1]
>>> threshold = 1900.0
>>> sales_column_index = 3
>>> first_worksheet = True
>>> with open_workbook(input_file) as workbook:
>>>
>>>     data = []
>>>
>>>     for sheet_index in range(workbook.nsheets):
>>>
>>>         if sheet_index in my_sheets:
>>>
>>>             worksheet =workbook.sheet_by_index(sheet_index)
>>>
>>>             if first_worksheet:
>>>
>>>                 header_row =worksheet.row_values(0)
>>>                 data.append(header_row)
>>>
>>>                 first_worksheet = False
>>>
>>>             for row_index in range(1,worksheet.nrows):
>>>
>>>                 row_list = []
>>>
>>>                 sale_amount =worksheet.cell_value(row_index, sales_column_index)
>>>
>>>                 if sale_amount > threshold:
>>>
>>>                     for column_index in range(worksheet.ncols):
>>>
>>>                         cell_value =worksheet.cell_value(row_index,column_index)
>>>
>>>                         cell_type =worksheet.cell_type(row_index,column_index)
>>>
>>>                         if cell_type == 3:
>>>
>>>                             date_cell =xldate_as_tuple(cell_value,workbook.datemode)
>>>
>>>                             date_cell =date(*date_cell[0:3]).strftime('%m/%d/%Y')
>>>
>>>                             row_list.append(date_cell)
>>>
>>>                         else:
>>>
>>>                             row_list.append(cell_value)
>>>
>>>                 if row_list:
>>>
>>>                     data.append(row_list)
>>>
>>>     for list_index, output_list in enumerate(data):
>>>
>>>         for element_index, element in enumerate(output_list):
>>>
>>>             output_worksheet.write(list_index,element_index, element)
>>> output_workbook.save(output_file)
---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

Cell In [1], line 14
     12 sales_column_index = 3
     13 first_worksheet = True
---> 14 with open_workbook(input_file) as workbook:
     16     data = []
     18     for sheet_index in range(workbook.nsheets):


File /usr/lib/python3/dist-packages/xlrd/__init__.py:111, in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    109 else:
    110     filename = os.path.expanduser(filename)
--> 111     with open(filename, "rb") as f:
    112         peek = f.read(peeksz)
    113 if peek == b"PK\x03\x04": # a ZIP file


FileNotFoundError: [Errno 2] No such file or directory: '-f'

Line 10 creates a list variable named my_sheets that contains two integers representing the index values of the worksheets we want to process.

Line 16 creates index values for all of the worksheets in the workbook and applies a for loop over the index values.

Line 17 tests whether the index value being considered in the for loop is one of the index values in my_sheets. This test ensures that we only process the worksheets that we want to process.

Because we’re iterating through worksheet index values, we need to use the workbook’s sheet_by_index function in conjunction with an index value in line 18 to access the current worksheet.

For the first worksheet we want to process, line 19 is True, so we append the header row into data and then set first_worksheet equal to False. Then we process the remaining data rows in a similar fashion, as we did in earlier examples. For the second and subsequent worksheets we want to process, the script moves ahead to line 23 to process the data rows in the worksheet.

To run the script, type the following on the command line and hit Enter:

python 11excel_value_meets_condition_set_of_worksheets.py sales_2013.xlsxoutput_files11output.xls

You can then open the output file, 11output.xls , to review the results.

PANDAS

Pandas makes it easy to select a subset of worksheets in a workbook. You simply specify the index numbers or names of the worksheets as a list in the read_excel function. In this example, we create a list of index numbers named my_sheets and then set sheetname equal to my_sheets inside the read_excel function.

To select a subset of worksheets with pandas, type the following code into a text editor and save the file as pandas_value_meets_condition_set_of_worksheets.py:

>>> #!/usr/bin/env python3
>>> import pandas as pd
>>> import sys
>>> input_file = sys.argv[1]
>>> output_file = sys.argv[2]
>>> my_sheets = [0,1]
>>> threshold = 1900.0
>>> data_frame = pd.read_excel(input_file,sheet_name=my_sheets, index_col=None)
>>> row_list = []
>>> for worksheet_name, data in data_frame.items():
>>>     row_list.append(data[data['SaleAmount'].astype(float) > threshold])
>>> filtered_rows = pd.concat(row_list, axis=0,ignore_index=True)
>>> writer = pd.ExcelWriter(output_file)
>>> filtered_rows.to_excel(writer,sheet_name='set_of_worksheets', index=False)
>>> writer.save()
---------------------------------------------------------------------------

InvalidFileException                      Traceback (most recent call last)

<ipython-input-6-145ad5a5ed81> in <module>
      6 my_sheets = [0,1]
      7 threshold = 1900.0
----> 8 data_frame = pd.read_excel(input_file,sheet_name=my_sheets, index_col=None)
      9 row_list = []
     10 for worksheet_name, data in data_frame.items():


/usr/lib/python3/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    294                 )
    295                 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 296             return func(*args, **kwargs)
    297
    298         return wrapper


/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols)
    325
    326     if not isinstance(io, ExcelFile):
--> 327         io = ExcelFile(io, engine=engine)
    328     elif engine and engine != io.engine:
    329         raise ValueError(


/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in __init__(self, path_or_buffer, engine)
    952         self._io = stringify_path(path_or_buffer)
    953
--> 954         self._reader = self._engines[engine](self._io)
    955
    956     def __fspath__(self):


/usr/lib/python3/dist-packages/pandas/io/excel/_openpyxl.py in __init__(self, filepath_or_buffer)
    478         """
    479         import_optional_dependency("openpyxl")
--> 480         super().__init__(filepath_or_buffer)
    481
    482     @property


/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in __init__(self, filepath_or_buffer)
    374             self.book = self.load_workbook(filepath_or_buffer)
    375         elif isinstance(filepath_or_buffer, str):
--> 376             self.book = self.load_workbook(filepath_or_buffer)
    377         elif isinstance(filepath_or_buffer, bytes):
    378             self.book = self.load_workbook(BytesIO(filepath_or_buffer))


/usr/lib/python3/dist-packages/pandas/io/excel/_openpyxl.py in load_workbook(self, filepath_or_buffer)
    489         from openpyxl import load_workbook
    490
--> 491         return load_workbook(
    492             filepath_or_buffer, read_only=True, data_only=True, keep_links=False
    493         )


/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
    310
    311     """
--> 312     reader = ExcelReader(filename, read_only, keep_vba,
    313                         data_only, keep_links)
    314     reader.read()


/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in __init__(self, fn, read_only, keep_vba, data_only, keep_links)
    122     def __init__(self,  fn, read_only=False, keep_vba=KEEP_VBA,
    123                   data_only=False, keep_links=True):
--> 124         self.archive = _validate_archive(fn)
    125         self.valid_files = self.archive.namelist()
    126         self.read_only = read_only


/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in _validate_archive(filename)
     92                        'Supported formats are: %s') % (file_format,
     93                                                        ','.join(SUPPORTED_FORMATS))
---> 94             raise InvalidFileException(msg)
     95
     96     archive = ZipFile(filename, 'r')


InvalidFileException: openpyxl does not support  file format, please check you can open it with Excel first. Supported formats are: .xlsx,.xlsm,.xltx,.xltm

To run the script, type the following on the command line and hit Enter:

python pandas_value_meets_condition_set_of_worksheets.pysales_2013.xlsx output_filespandas_output.xls

You can then open the output file, review the results.