3.1. Filter for Specific Rows Across All Worksheets¶
BASE PYTHON To filter for all of the rows in all of the worksheets where the sale amount is greater than $2,000.00 with base Python, type the following code into a text editor and save the file as 9excel_value_meets_condition_all_worksheets.py :
>>> #!/usr/bin/env python3
>>> import sys
>>> from datetime import date
>>> from xlrd import open_workbook, xldate_as_tuple
>>> from xlwt import Workbook
>>> input_file = sys.argv[1]
>>> output_file = sys.argv[2]
>>> output_workbook = Workbook()
>>> output_worksheet =output_workbook.add_sheet('filtered_rows_all_worksheets')
>>> sales_column_index = 3
>>> threshold = 2000.0
>>> first_worksheet = True
>>> with open_workbook(input_file) as workbook:
>>>
>>> data = []
>>>
>>> for worksheet in workbook.sheets():
>>>
>>> if first_worksheet:
>>>
>>> header_row = worksheet.row_values(0)
>>>
>>> data.append(header_row)
>>>
>>> first_worksheet = False
>>>
>>> for row_index in range(1,worksheet.nrows):
>>>
>>> row_list = []
>>>
>>> sale_amount = worksheet.cell_value(row_index, sales_column_index)
>>>
>>> if sale_amount > threshold:
>>>
>>> for column_index in range(worksheet.ncols):
>>>
>>> cell_value =worksheet.cell_value(row_index,column_index)
>>>
>>> cell_type =worksheet.cell_type(row_index, column_index)
>>>
>>> if cell_type == 3:
>>>
>>> date_cell =xldate_as_tuple(cell_value,workbook.datemode)
>>>
>>> date_cell =date(*date_cell[0:3]).strftime('%m/%d/%Y')
>>>
>>> row_list.append(date_cell)
>>> else:
>>>
>>> row_list.append(cell_value)
>>>
>>> if row_list:
>>>
>>> data.append(row_list)
>>>
>>> for list_index, output_list in enumerate(data):
>>>
>>> for element_index, element in enumerate(output_list):
>>>
>>> output_worksheet.write(list_index,element_index, element)
>>> output_workbook.save(output_file)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-4-0c80e7df10d1> in <module>
11 threshold = 2000.0
12 first_worksheet = True
---> 13 with open_workbook(input_file) as workbook:
14
15 data = []
~/.local/lib/python3.9/site-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
109 else:
110 filename = os.path.expanduser(filename)
--> 111 with open(filename, "rb") as f:
112 peek = f.read(peeksz)
113 if peek == b"PK\x03\x04": # a ZIP file
FileNotFoundError: [Errno 2] No such file or directory: '-f'
Line 10 creates a variable named sales_column_index to hold the index value of the Sale Amount column. Similarly, line 11 creates a variable named threshold to hold the sale amount we care about. We’ll compare each of the values in the Sale Amount column to this threshold value to determine which rows to write to the output file.
Line 15 creates the for loop we use to iterate through all of the worksheets in the workbook. It uses the workbook object’s sheets attribute to list all of the worksheets in the workbook. Line 16 is True for the first worksheet, so for the first worksheet, we extract the header row, append it into data, and then set first_worksheet equal to False. The code continues and processes the remainingdata rows where the sale amount in the row is greater than the threshold value.
For all of the subsequent worksheets, first_worksheet is False, so the script moves ahead to line 20 to process the data rows in each worksheet. You know that it processes the data rows, and not the header row, because the range function starts at one instead of zero.
To run the script, type the following on the command line and hit Enter:
You can then open the output file,9output.xls , to review the results.
PANDAS
Pandas enables you to read all of the worksheets in a workbook at once by specifying sheetname=None in the read_excel function. Pandas reads the worksheets into a dictionary of DataFrames where the key is the worksheet’s name and the value is the worksheet’s data in a DataFrame. So you can evaluate all of the data in the workbook by iterating through the dictionary’s keys and values. When you filter for specific rows in eachDataFrame, the result is a new, filtered DataFrame, so you can create a list of these filtered DataFrames and then concatenate them together into a final DataFrame.
In this example, we want to filter for all of the rows in all of the worksheets where the sale amount is greater than $2,000.00. To filter for these rows with pandas, type the following code into a text editor and save the file as pandas_value_meets_condition_all_worksheets.py:
>>> #!/usr/bin/env python3
>>> import pandas as pd
>>> import sys
>>> input_file = sys.argv[1]
>>> print(input_file)
>>> print(sys)
>>> output_file = sys.argv[2]
>>> data_frame = pd.read_excel(input_file, sheet_name=None,index_col=None)
>>> row_output = []
>>> for worksheet_name, data in data_frame.items():
>>> row_output.append(data[data['SaleAmount'].astype(float) > 2000.0])
>>> filtered_rows = pd.concat(row_output, axis=0,ignore_index=True)
>>> writer = pd.ExcelWriter(output_file)
>>> filtered_rows.to_excel(writer,sheet_name='sale_amount_gt2000', index=False)
>>> writer.save()
-f
<module 'sys' (built-in)>
---------------------------------------------------------------------------
InvalidFileException Traceback (most recent call last)
<ipython-input-7-d94c135bbc9a> in <module>
6 print(sys)
7 output_file = sys.argv[2]
----> 8 data_frame = pd.read_excel(input_file, sheet_name=None,index_col=None)
9 row_output = []
10 for worksheet_name, data in data_frame.items():
/usr/lib/python3/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
294 )
295 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 296 return func(*args, **kwargs)
297
298 return wrapper
/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols)
325
326 if not isinstance(io, ExcelFile):
--> 327 io = ExcelFile(io, engine=engine)
328 elif engine and engine != io.engine:
329 raise ValueError(
/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in __init__(self, path_or_buffer, engine)
952 self._io = stringify_path(path_or_buffer)
953
--> 954 self._reader = self._engines[engine](self._io)
955
956 def __fspath__(self):
/usr/lib/python3/dist-packages/pandas/io/excel/_openpyxl.py in __init__(self, filepath_or_buffer)
478 """
479 import_optional_dependency("openpyxl")
--> 480 super().__init__(filepath_or_buffer)
481
482 @property
/usr/lib/python3/dist-packages/pandas/io/excel/_base.py in __init__(self, filepath_or_buffer)
374 self.book = self.load_workbook(filepath_or_buffer)
375 elif isinstance(filepath_or_buffer, str):
--> 376 self.book = self.load_workbook(filepath_or_buffer)
377 elif isinstance(filepath_or_buffer, bytes):
378 self.book = self.load_workbook(BytesIO(filepath_or_buffer))
/usr/lib/python3/dist-packages/pandas/io/excel/_openpyxl.py in load_workbook(self, filepath_or_buffer)
489 from openpyxl import load_workbook
490
--> 491 return load_workbook(
492 filepath_or_buffer, read_only=True, data_only=True, keep_links=False
493 )
/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
310
311 """
--> 312 reader = ExcelReader(filename, read_only, keep_vba,
313 data_only, keep_links)
314 reader.read()
/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in __init__(self, fn, read_only, keep_vba, data_only, keep_links)
122 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
123 data_only=False, keep_links=True):
--> 124 self.archive = _validate_archive(fn)
125 self.valid_files = self.archive.namelist()
126 self.read_only = read_only
/usr/lib/python3/dist-packages/openpyxl/reader/excel.py in _validate_archive(filename)
92 'Supported formats are: %s') % (file_format,
93 ','.join(SUPPORTED_FORMATS))
---> 94 raise InvalidFileException(msg)
95
96 archive = ZipFile(filename, 'r')
InvalidFileException: openpyxl does not support file format, please check you can open it with Excel first. Supported formats are: .xlsx,.xlsm,.xltx,.xltm
To run the script, type the following on the command line and hit Enter:
You can then open the output file, review the results.