3.2. Read CSV

File paths also work with URLs

3.2.1. SetUp

>>> import pandas as pd
>>>
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

3.2.2. Example

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA)
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

3.2.3. Parameters

delimiter - field separator
header - row number(s) containing column labels and marking the start of the data
names - how to name columns
index_col - which column should be an index
usecols - which columns to use
skiprows - how many rows to skip, from the top
skipfooter - how many rows to skip, from the bottom
nrows - how many rows to read
skip_blank_lines - skip blank lines?
parse_dates - columns whose values should be parsed (converted) to dates
chunksize - how many rows to read at once (useful for working with data greater than available RAM)
thousands - thousand separator (comma, period, space or None)
decimal - decimal separator (comma or period)
encoding - file encoding, default: utf-8

>>> def read_csv(filepath_or_buffer, *, sep=..., delimiter=None,
...              header='infer', names=..., index_col=None,
...              usecols=None, dtype=None, engine=None, converters=None,
...              true_values=None, false_values=None, skipinitialspace=False,
...              skiprows=None, skipfooter=0, nrows=None, na_values=None,
...              keep_default_na=True, na_filter=True, verbose=...,
...              skip_blank_lines=True, parse_dates=None,
...              infer_datetime_format=..., keep_date_col=...,
...              date_parser=..., date_format=None, dayfirst=False,
...              cache_dates=True, iterator=False, chunksize=None,
...              compression='infer', thousands=None, decimal='.',
...              lineterminator=None, quotechar='"', quoting=0, doublequote=True,
...              escapechar=None, comment=None, encoding=None,
...              encoding_errors='strict', dialect=None, on_bad_lines='error',
...              delim_whitespace=..., low_memory=True, memory_map=False,
...              float_precision=None, storage_options=None,
...              dtype_backend=...): ...

3.2.4. Parse Dates

DATA = 'https://python3.info/_static/example.csv'
pd.read_csv(DATA, parse_dates=['lastlogin'])

Without parsing dates, the lastlogin column is a string:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>>
>>> pd.read_csv(DATA)
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

With parsing dates, the lastlogin column is a datetime:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, parse_dates=['lastlogin'])
  firstname    lastname  age                email  lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com 2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com 2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com 2000-01-03       True               users
3      Dave      Durian   33     dave@example.org 2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org 2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net        NaT      False                 NaN

3.2.5. Header

DATA = 'https://python3.info/_static/example.csv'

Without specifying the header, the first row is used as the header:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA)
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

With header=1, the second row is used as the header:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, header=1)
     Alice     Apricot  30    alice@example.com  2000-01-01   True         users;staff
0      Bob  Blackthorn  31      bob@example.com  2000-01-02   True         users;staff
1    Carol        Corn  32    carol@example.com  2000-01-03   True               users
2     Dave      Durian  33     dave@example.org  2000-01-04   True               users
3      Eve  Elderberry  34      eve@example.org  2000-01-05   True  users;staff;admins
4  Mallory       Melon  15  mallory@example.net         NaN  False                 NaN

3.2.6. Rename Columns

DATA = 'https://python3.info/_static/example.csv'

Without specifying the names, the first row is used as the header:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA)
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

With names, the columns are renamed:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, names=['First Name', 'Last Name', 'Age', 'E-mail', 'Last Login', 'Is Active?', 'Groups'], header=0)
  First Name   Last Name  Age               E-mail  Last Login  Is Active?              Groups
0      Alice     Apricot   30    alice@example.com  2000-01-01        True         users;staff
1        Bob  Blackthorn   31      bob@example.com  2000-01-02        True         users;staff
2      Carol        Corn   32    carol@example.com  2000-01-03        True               users
3       Dave      Durian   33     dave@example.org  2000-01-04        True               users
4        Eve  Elderberry   34      eve@example.org  2000-01-05        True  users;staff;admins
5    Mallory       Melon   15  mallory@example.net         NaN       False                 NaN

3.2.7. Use Columns

DATA = 'https://python3.info/_static/example.csv'
pd.read_csv(DATA, usecols=['firstname', 'lastname', 'age'])

Without specifying the columns, all columns are used:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA)
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

With usecols, only the specified columns are used:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, usecols=['firstname', 'lastname', 'age'])
  firstname    lastname  age
0     Alice     Apricot   30
1       Bob  Blackthorn   31
2     Carol        Corn   32
3      Dave      Durian   33
4       Eve  Elderberry   34
5   Mallory       Melon   15

3.2.8. Index Column

DATA = 'https://python3.info/_static/example.csv'

Without specifying the index column, the default integer index is used:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA)
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

Single Index Column:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, index_col='email')
                    firstname    lastname  age   lastlogin  is_active              groups
email
alice@example.com       Alice     Apricot   30  2000-01-01       True         users;staff
bob@example.com           Bob  Blackthorn   31  2000-01-02       True         users;staff
carol@example.com       Carol        Corn   32  2000-01-03       True               users
dave@example.org         Dave      Durian   33  2000-01-04       True               users
eve@example.org           Eve  Elderberry   34  2000-01-05       True  users;staff;admins
mallory@example.net   Mallory       Melon   15         NaN      False                 NaN

Multi-Index Column:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, index_col=['firstname', 'lastname'])
                      age                email   lastlogin  is_active              groups
firstname lastname
Alice     Apricot      30    alice@example.com  2000-01-01       True         users;staff
Bob       Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
Carol     Corn         32    carol@example.com  2000-01-03       True               users
Dave      Durian       33     dave@example.org  2000-01-04       True               users
Eve       Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
Mallory   Melon        15  mallory@example.net         NaN      False                 NaN

3.2.9. Encoding

DATA = 'https://python3.info/_static/example.csv'
utf-8 (default)
utf-16
cp1250 or windows-1250
iso-8859-2

With encoding, the file is read with the specified encoding:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, encoding='utf-8')
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

3.2.10. Delimiter

DATA = 'https://python3.info/_static/example.csv'
pd.read_csv(DATA, delimiter=',')

With delimiter, the file is read with the specified delimiter:

>>> DATA = 'https://python3.info/_static/example.csv'
>>>
>>> pd.read_csv(DATA, delimiter=',')
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

3.2.11. Thousands

DATA = 'https://python3.info/_static/example.csv'
100000000 - no separator
100,000,000 - English speaking countries
100.000.000 - non-English speaking countries
100 000 000 - international recommendation
100'000'000 - apostrophe separator
10,00,00,000 - Indian numbering system

3.2.12. Decimal

DATA = 'https://python3.info/_static/example.csv'
1.00 - dot separator (English speaking countries)
1,00 - comma separator (non-English speaking countries)

3.2.13. Compressed

DATA = 'https://python3.info/_static/example.zip'
pd.read_csv(DATA, compression='zip')
pd.read_csv(DATA, compression='infer')
With compression='infer' (the default), if the extension is .gz, .bz2, .zip, or .xz, the corresponding compression method is automatically selected

>>> DATA = 'https://python3.info/_static/example.zip'
>>>
>>> df = pd.read_csv(DATA, compression='zip')
>>> df = pd.read_csv(DATA, compression='infer')

3.2.14. Use Case - 1

>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> header = pd.read_csv(DATA, nrows=0).columns
>>>
>>> list(header)
['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

Label Encoder:

>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, nvalues, *labels = header.columns
>>> decoder = dict(enumerate(labels))
>>>
>>> decoder
{0: 'petal_length', 1: 'petal_width', 2: 'species'}

3.2.15. Use Case - 2

>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS =  [
...     'sepal_length',
...     'sepal_width',
...     'petal_length',
...     'petal_width',
...     'species',
... ]

>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, ncols, *class_labels = header.columns
>>> label_encoder = dict(enumerate(class_labels))
>>>
>>> label_encoder
{0: 'setosa', 1: 'versicolor', 2: 'virginica'}

>>> df = (
...     pd
...     .read_csv(DATA, names=COLUMNS, skiprows=1)
...     .replace({'species':label_encoder})
...     .head(n=5)
... )
>>>
>>> df
   sepal_length  sepal_width  petal_length  petal_width     species
0           5.4          3.9           1.3          0.4      setosa
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor
3           7.3          2.9           6.3          1.8   virginica
4           5.6          2.5           3.9          1.1  versicolor

3.2.16. Use Case - 3

Name,         Long,       Lat,        ModuleType
"ESA EAC",    50.8524881, 7.1315254,  Indoor

Date,         Time,       Temperature, Humidity, CO2, Noise, Pressure
"2000-01-01", "00:00:00", 22.6,        46,       981, 32,    1019.1
"2000-01-01", "00:05:00", 22.6,        46,       981, 31,    1019.1
"2000-01-01", "00:10:00", 22.6,        46,       968, 32,    1019.1

Name;Long;Lat;ModuleName;ModuleType
"European Astronaut Centre";50.8524881,7.1315254;;Indoor
;;;;;;
Timestamp;"Timezone : Europe/Berlin";Temperature;Humidity;CO2;Noise;Pressure
1622498702;"2021/06/01 00:05:02";22.6;46;981;32;1019.1
1622499004;"2021/06/01 00:10:04";22.6;46;981;31;1019.1
1622499306;"2021/06/01 00:15:06";22.6;46;968;32;1019.1
1622499608;"2021/06/01 00:20:08";22.5;46;940;31;1019.1
1622499912;"2021/06/01 00:25:12";22.5;46;907;32;1019
1622500214;"2021/06/01 00:30:14";22.5;46;877;31;1019
1622500517;"2021/06/01 00:35:17";22.4;46;873;32;1019

>>> DATA= """Name;Long;Lat;ModuleName;ModuleType
... "European Astronaut Centre";50.8524881,7.1315254;;Indoor
... ;;;;;;
... Timestamp;"Timezone : Europe/Berlin";Temperature;Humidity;CO2;Noise;Pressure
... 1622498702;"2021/06/01 00:05:02";22.6;46;981;32;1019.1
... 1622499004;"2021/06/01 00:10:04";22.6;46;981;31;1019.1
... 1622499306;"2021/06/01 00:15:06";22.6;46;968;32;1019.1
... 1622499608;"2021/06/01 00:20:08";22.5;46;940;31;1019.1
... 1622499912;"2021/06/01 00:25:12";22.5;46;907;32;1019
... 1622500214;"2021/06/01 00:30:14";22.5;46;877;31;1019
... 1622500517;"2021/06/01 00:35:17";22.4;46;873;32;1019
... """

3.2.17. Assignments

# %% About
# - Name: Pandas ReadCSV Data
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Define variable `result` with the solution
# 3. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Zdefiniuj zmienną `result` z rozwiązaniem
# 3. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age                email   lastlogin  is_active              groups
# 0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
# 1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
# 2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
# 3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
# 4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
# 5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

# %% Hints
# - `pd.read_csv()`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests above require `type(result) is pd.DataFrame`.
result: pd.DataFrame

# %% Data
# URL of the CSV file to read.
DATA = 'https://python3.info/_static/example.csv'

# %% Result
# Placeholder: the doctests reject `...` (Ellipsis); replace it with the
# expression that reads `DATA` into a DataFrame.
result = ...

# %% About
# - Name: Pandas ReadCSV Parse Date
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Parse dates in "lastlogin" column
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Sparsuj daty w kolumnie "lastlogin"
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age                email  lastlogin  is_active              groups
# 0     Alice     Apricot   30    alice@example.com 2000-01-01       True         users;staff
# 1       Bob  Blackthorn   31      bob@example.com 2000-01-02       True         users;staff
# 2     Carol        Corn   32    carol@example.com 2000-01-03       True               users
# 3      Dave      Durian   33     dave@example.org 2000-01-04       True               users
# 4       Eve  Elderberry   34      eve@example.org 2000-01-05       True  users;staff;admins
# 5   Mallory       Melon   15  mallory@example.net        NaT      False                 NaN

# %% Hints
# - `pd.read_csv(parse_dates=...)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result.dtypes  # doctest: +NORMALIZE_WHITESPACE
firstname               str
lastname                str
age                   int64
email                   str
lastlogin    datetime64[us]
is_active              bool
groups                  str
dtype: object

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age                email  lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com 2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com 2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com 2000-01-03       True               users
3      Dave      Durian   33     dave@example.org 2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org 2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net        NaT      False                 NaN
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests above require `type(result) is pd.DataFrame` and expect the
# `lastlogin` column to have dtype `datetime64[us]`.
result: pd.DataFrame

# %% Data
# URL of the CSV file to read.
DATA = 'https://python3.info/_static/example.csv'

# %% Result
# Placeholder: the doctests reject `...` (Ellipsis); replace it with the
# expression that reads `DATA` and parses dates in the `lastlogin` column.
result = ...

# %% About
# - Name: Pandas ReadCSV Encoding
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Read data using 'utf-8' encoding
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Sparsuj dane używając kodowania 'utf-8'
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age                email   lastlogin  is_active              groups
# 0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
# 1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
# 2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
# 3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
# 4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
# 5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

# %% Hints
# - `pd.read_csv(encoding=...)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests above require `type(result) is pd.DataFrame`.
result: pd.DataFrame

# %% Data
# URL of the CSV file to read.
DATA = 'https://python3.info/_static/example.csv'

# %% Result
# Placeholder: the doctests reject `...` (Ellipsis); replace it with the
# expression that reads `DATA` using the 'utf-8' encoding.
result = ...

# %% About
# - Name: Pandas ReadCSV Skip Rows
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Use `NAMES` as column names and skip first row
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Użyj `NAMES` jako nazw kolumn i pomiń pierwszy wiersz
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#      fname       lname  age                email   lastlogin  is_active              groups
# 0    Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
# 1      Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
# 2    Carol        Corn   32    carol@example.com  2000-01-03       True               users
# 3     Dave      Durian   33     dave@example.org  2000-01-04       True               users
# 4      Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
# 5  Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

# %% Hints
# - `pd.read_csv(names=..., skiprows=...)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
     fname       lname  age                email   lastlogin  is_active              groups
0    Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1      Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2    Carol        Corn   32    carol@example.com  2000-01-03       True               users
3     Dave      Durian   33     dave@example.org  2000-01-04       True               users
4      Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5  Mallory       Melon   15  mallory@example.net         NaN      False                 NaN
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests for this exercise require `type(result) is pd.DataFrame`.
result: pd.DataFrame

# %% Data
# URL of the CSV file to read.
DATA = 'https://python3.info/_static/example.csv'

# Column labels that the expected output shows in place of the file's own
# header (e.g. `fname`/`lname`), listed in the same left-to-right order.
NAMES = [
    'fname',
    'lname',
    'age',
    'email',
    'lastlogin',
    'is_active',
    'groups',
]

# %% Result
# Placeholder for the learner's solution: the doctests fail while `result`
# is still Ellipsis. The hint points at the `names=` and `skiprows=`
# parameters of `read_csv`.
result = ...

# %% About
# - Name: Pandas ReadCSV N Rows
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Read only the first 3 rows
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Wczytaj tylko pierwsze 3 wiersze
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age              email   lastlogin  is_active       groups
# 0     Alice     Apricot   30  alice@example.com  2000-01-01       True  users;staff
# 1       Bob  Blackthorn   31    bob@example.com  2000-01-02       True  users;staff
# 2     Carol        Corn   32  carol@example.com  2000-01-03       True        users

# %% Hints
# - `pd.read_csv(nrows=...)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age              email   lastlogin  is_active       groups
0     Alice     Apricot   30  alice@example.com  2000-01-01       True  users;staff
1       Bob  Blackthorn   31    bob@example.com  2000-01-02       True  users;staff
2     Carol        Corn   32  carol@example.com  2000-01-03       True        users
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests for this exercise require `type(result) is pd.DataFrame`.
result: pd.DataFrame

# %% Data
# URL of the CSV file to read.
DATA = 'https://python3.info/_static/example.csv'

# %% Result
# Placeholder for the learner's solution: the doctests fail while `result`
# is still Ellipsis. The task asks for only the first 3 rows of `DATA`.
result = ...

# %% About
# - Name: Pandas ReadCSV Use Cols
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Read only columns: 'firstname', 'lastname', 'age'
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Wczytaj tylko kolumny: 'firstname', 'lastname', 'age'
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age
# 0     Alice     Apricot   30
# 1       Bob  Blackthorn   31
# 2     Carol        Corn   32
# 3      Dave      Durian   33
# 4       Eve  Elderberry   34
# 5   Mallory       Melon   15

# %% Hints
# - `pd.read_csv(usecols=...)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age
0     Alice     Apricot   30
1       Bob  Blackthorn   31
2     Carol        Corn   32
3      Dave      Durian   33
4       Eve  Elderberry   34
5   Mallory       Melon   15
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests for this exercise require `type(result) is pd.DataFrame`.
result: pd.DataFrame

# %% Data
# URL of the CSV file to read.
DATA = 'https://python3.info/_static/example.csv'

# %% Result
# Placeholder for the learner's solution: the doctests fail while `result`
# is still Ellipsis. The task asks for only the `firstname`, `lastname`
# and `age` columns of `DATA`.
result = ...

# %% About
# - Name: Pandas ReadCSV Index Col
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Set `email` column as index
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Ustaw kolumnę `email` jako indeks
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#                     firstname    lastname  age   lastlogin  is_active              groups
# email
# alice@example.com       Alice     Apricot   30  2000-01-01       True         users;staff
# bob@example.com           Bob  Blackthorn   31  2000-01-02       True         users;staff
# carol@example.com       Carol        Corn   32  2000-01-03       True               users
# dave@example.org         Dave      Durian   33  2000-01-04       True               users
# eve@example.org           Eve  Elderberry   34  2000-01-05       True  users;staff;admins
# mallory@example.net   Mallory       Melon   15         NaN      False                 NaN

# %% Hints
# - `pd.read_csv(index_col=...)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
                    firstname    lastname  age   lastlogin  is_active              groups
email
alice@example.com       Alice     Apricot   30  2000-01-01       True         users;staff
bob@example.com           Bob  Blackthorn   31  2000-01-02       True         users;staff
carol@example.com       Carol        Corn   32  2000-01-03       True               users
dave@example.org         Dave      Durian   33  2000-01-04       True               users
eve@example.org           Eve  Elderberry   34  2000-01-05       True  users;staff;admins
mallory@example.net   Mallory       Melon   15         NaN      False                 NaN
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests for this exercise require `type(result) is pd.DataFrame`.
result: pd.DataFrame

# %% Data
# URL of the CSV file to read.
DATA = 'https://python3.info/_static/example.csv'

# %% Result
# Placeholder for the learner's solution: the doctests fail while `result`
# is still Ellipsis. The task asks for the `email` column to be the index.
result = ...

# %% About
# - Name: Pandas ReadCSV Compression
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Use `zip` compression when reading the data
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Ustaw kompresję `zip` podczas wczytywania danych
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age                email   lastlogin  is_active              groups
# 0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
# 1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
# 2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
# 3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
# 4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
# 5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

# %% Hints
# - `pd.read_csv(compression=...)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# The doctests for this exercise require `type(result) is pd.DataFrame`.
result: pd.DataFrame

# %% Data
# URL of the input; unlike the other exercises this one points at a `.zip`
# file rather than a `.csv`.
DATA = 'https://python3.info/_static/example.zip'

# %% Result
# Placeholder for the learner's solution: the doctests fail while `result`
# is still Ellipsis. The task asks for `zip` compression to be used when
# reading `DATA`.
result = ...