4.1. Filter for Specific Rows Across a Set of Worksheets BASE PYTHON¶
In this case, we want to filter for rows from the first and second worksheets where the sale amount is greater than $1,900.00. To select this subset of rows from the first and second worksheets with base Python, type the following code into a text editor and save the file as 11excel_value_meets_condition_set_of_worksheets.py:
>>> #!/usr/bin/env python3
>>> import sys
>>> from datetime import date
>>> from xlrd import open_workbook, xldate_as_tuple
>>> from xlwt import Workbook
>>> input_file = sys.argv[1]
>>> output_file = sys.argv[2]
>>> output_workbook = Workbook()
>>> output_worksheet =output_workbook.add_sheet('set_of_worksheets')
>>> my_sheets = [0,1]
>>> threshold = 1900.0
>>> sales_column_index = 3
>>> first_worksheet = True
>>> with open_workbook(input_file) as workbook:
>>>
>>> data = []
>>>
>>> for sheet_index in range(workbook.nsheets):
>>>
>>> if sheet_index in my_sheets:
>>>
>>> worksheet =workbook.sheet_by_index(sheet_index)
>>>
>>> if first_worksheet:
>>>
>>> header_row =worksheet.row_values(0)
>>> data.append(header_row)
>>>
>>> first_worksheet = False
>>>
>>> for row_index in range(1,worksheet.nrows):
>>>
>>> row_list = []
>>>
>>> sale_amount =worksheet.cell_value(row_index, sales_column_index)
>>>
>>> if sale_amount > threshold:
>>>
>>> for column_index in range(worksheet.ncols):
>>>
>>> cell_value =worksheet.cell_value(row_index,column_index)
>>>
>>> cell_type =worksheet.cell_type(row_index,column_index)
>>>
>>> if cell_type == 3:
>>>
>>> date_cell =xldate_as_tuple(cell_value,workbook.datemode)
>>>
>>> date_cell =date(*date_cell[0:3]).strftime('%m/%d/%Y')
>>>
>>> row_list.append(date_cell)
>>>
>>> else:
>>>
>>> row_list.append(cell_value)
>>>
>>> if row_list:
>>>
>>> data.append(row_list)
>>>
>>> for list_index, output_list in enumerate(data):
>>>
>>> for element_index, element in enumerate(output_list):
>>>
>>> output_worksheet.write(list_index,element_index, element)
>>> output_workbook.save(output_file)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In [1], line 14
12 sales_column_index = 3
13 first_worksheet = True
---> 14 with open_workbook(input_file) as workbook:
16 data = []
18 for sheet_index in range(workbook.nsheets):
File /usr/lib/python3/dist-packages/xlrd/__init__.py:111, in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
109 else:
110 filename = os.path.expanduser(filename)
--> 111 with open(filename, "rb") as f:
112 peek = f.read(peeksz)
113 if peek == b"PK\x03\x04": # a ZIP file
FileNotFoundError: [Errno 2] No such file or directory: '-f'
Line 10 creates a list variable named my_sheets that contains two integers representing the index values of the worksheets we want to process.
Line 16 creates index values for all of the worksheets in the workbook and applies a for loop over the index values.
Line 17 tests whether the index value being considered in the for loop is one of the index values in my_sheets. This test ensures that we only process the worksheets that we want to process.
Because we’re iterating through worksheet index values, we need to use the workbook’s sheet_by_index function in conjunction with an index value in line 18 to access the current worksheet.
For the first worksheet we want to process, line 19 is True, so we append the header row into data and then set first_worksheet equal to False. Then we process the remaining data rows in a similar fashion, as we did in earlier examples. For the second and subsequent worksheets we want to process, the script moves ahead to line 23 to process the data rows in the worksheet.
To run the script, type the following on the command line and hit Enter:
python 11excel_value_meets_condition_set_of_worksheets.py sales_2013.xlsxoutput_files11output.xls
You can then open the output file, 11output.xls , to review the results.
PANDAS
Pandas makes it easy to select a subset of worksheets in a workbook. You simply specify the index numbers or names of the worksheets as a list in the read_excel function. In this example, we create a list of index numbers named my_sheets and then set sheetname equal to my_sheets inside the read_excel function.
To select a subset of worksheets with pandas, type the following code into a text editor and save the file as pandas_value_meets_condition_set_of_worksheets.py:
>>> #!/usr/bin/env python3
>>> import pandas as pd
>>> import sys
>>> input_file = sys.argv[1]
>>> output_file = sys.argv[2]
>>> my_sheets = [0,1]
>>> threshold = 1900.0
>>> data_frame = pd.read_excel(input_file,sheet_name=my_sheets, index_col=None)
>>> row_list = []
>>> for worksheet_name, data in data_frame.items():
>>> row_list.append(data[data['SaleAmount'].astype(float) > threshold])
>>> filtered_rows = pd.concat(row_list, axis=0,ignore_index=True)
>>> writer = pd.ExcelWriter(output_file)
>>> filtered_rows.to_excel(writer,sheet_name='set_of_worksheets', index=False)
>>> writer.save()
---------------------------------------------------------------------------
InvalidFileException Traceback (most recent call last)
<ipython-input-6-145ad5a5ed81> in <module>
6 my_sheets = [0,1]
7 threshold = 1900.0
----> 8 data_frame = pd.read_excel(input_file,sheet_name=my_sheets, index_col=None)
9 row_list = []
10 for worksheet_name, data in data_frame.items():
/usr/lib/python3/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
294 )
295 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 296 return func(*args, **kwargs)
297
298 return wrapper
/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols)
325
326 if not isinstance(io, ExcelFile):
--> 327 io = ExcelFile(io, engine=engine)
328 elif engine and engine != io.engine:
329 raise ValueError(
/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in __init__(self, path_or_buffer, engine)
952 self._io = stringify_path(path_or_buffer)
953
--> 954 self._reader = self._engines[engine](self._io)
955
956 def __fspath__(self):
/usr/lib/python3/dist-packages/pandas/io/excel/_openpyxl.py in __init__(self, filepath_or_buffer)
478 """
479 import_optional_dependency("openpyxl")
--> 480 super().__init__(filepath_or_buffer)
481
482 @property
/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in __init__(self, filepath_or_buffer)
374 self.book = self.load_workbook(filepath_or_buffer)
375 elif isinstance(filepath_or_buffer, str):
--> 376 self.book = self.load_workbook(filepath_or_buffer)
377 elif isinstance(filepath_or_buffer, bytes):
378 self.book = self.load_workbook(BytesIO(filepath_or_buffer))
/usr/lib/python3/dist-packages/pandas/io/excel/_openpyxl.py in load_workbook(self, filepath_or_buffer)
489 from openpyxl import load_workbook
490
--> 491 return load_workbook(
492 filepath_or_buffer, read_only=True, data_only=True, keep_links=False
493 )
/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
310
311 """
--> 312 reader = ExcelReader(filename, read_only, keep_vba,
313 data_only, keep_links)
314 reader.read()
/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in __init__(self, fn, read_only, keep_vba, data_only, keep_links)
122 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
123 data_only=False, keep_links=True):
--> 124 self.archive = _validate_archive(fn)
125 self.valid_files = self.archive.namelist()
126 self.read_only = read_only
/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in _validate_archive(filename)
92 'Supported formats are: %s') % (file_format,
93 ','.join(SUPPORTED_FORMATS))
---> 94 raise InvalidFileException(msg)
95
96 archive = ZipFile(filename, 'r')
InvalidFileException: openpyxl does not support file format, please check you can open it with Excel first. Supported formats are: .xlsx,.xlsm,.xltx,.xltm
To run the script, type the following on the command line and hit Enter:
python pandas_value_meets_condition_set_of_worksheets.pysales_2013.xlsx output_filespandas_output.xls
You can then open the output file, review the results.