Tuesday, March 24, 2020

Python study notes 10: Plot all kinds of graphs in Python

How do we simply plot by histogram?
How do we plot with 2nd y axis?
How do we scatter plot by group in python?
How do we plot a heat map by state or county in the US in Python?


Question: How do we simply plot by histogram? and plot with 2nd y axis?
#===========================================================
#simple line plot with auto legend
import matplotlib.pyplot as plt
plt.figure(figsize=(10,8))

# (column, line/marker format, colour) for every curve drawn against gap_250_.
# Each curve gets label=<column name>: without labels plt.legend() has
# nothing to list and only produces an empty box plus a warning.
curve_specs = [
    ('ret_mon_12_avg12', '*-', 'green'),
    ('ret_mon_24_avg24', 'v-', 'green'),
    ('ret_mon_36_avg36', '^-', 'red'),
    ('ret_mon_48_avg48', 's-', 'yellow'),
    ('ret_mon_60_avg60', 'o-', 'blue'),
]
for column, fmt, colour in curve_specs:
    plt.plot(data1.gap_250_, getattr(data1, column), fmt, color=colour, label=column)

legend = plt.legend(loc='upper center', shadow=True, fontsize='x-large')
# Put a nicer background color on the legend.
legend.get_frame().set_facecolor('C0')

plt.show()
#==============

#simply plot by histogram
# numpy/pandas are needed by the kde example below; import them here so the
# snippet runs on its own.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 7))
series1.hist()
plt.show()

#or use kind=bar to draw one row of the frame as a bar chart
plt.figure(figsize=(10, 7))
df.iloc[5].plot(kind='bar')
plt.show()

#use kde plot for density plot:
# A fresh figure keeps the density curve off the bar chart's axes.
plt.figure(figsize=(10, 7))
ser = pd.Series(np.random.randn(1000))
ser.plot.kde()
plt.show()

#two histograms side by side on one figure
f4, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 5))
series1.hist(ax=ax1)
ax1.set_title('title1')
series2.hist(ax=ax2)
ax2.set_title('title2')
# tight_layout is called after the titles exist so it can make room for them.
f4.tight_layout()
plt.show()
#===========================================================

Plot with 2nd y-axis:
#===========================================================
import matplotlib.pyplot as plt
figure, ax1 = plt.subplots(figsize=(20,10))
# twinx() creates a second y axis that shares the x axis with ax1.
ax2 = ax1.twinx()
ax1.plot(df3.row1, df3.close_std, 'g-')
ax2.plot(df3.row1, df3.close_change, 'b-')
# Colour each y label like its line so the reader can tell the axes apart.
ax1.set_ylabel('close_std', color='g')
ax2.set_ylabel('close_change', color='b')
plt.show()
#===========================================================

Question: How do we scatter plot by group in python?
#===========================================================
##simply try the plot overlay first
import matplotlib.pyplot as plt
plt.figure(figsize=(10,8))

# One (y column, colour, marker) entry per scatter layer, drawn in this order
# on the same axes.
# NOTE(review): var_y2 is listed three times - possibly meant to be three
# different columns; confirm against the data.
overlay_specs = [
    ('var_y1', 'green', '.'),
    ('var_y2', 'red', 'd'),
    ('var_y2', 'blue', 'o'),
    ('var_y2', 'k', '*'),
]
x_values = data1.var_x
for y_name, colour, mark in overlay_specs:
    plt.scatter(x_values, getattr(data1, y_name), color=colour, marker=mark)
plt.show()
#=========================================================== 
#Method 1: use matplotlib to scatter plot by group
import matplotlib.pyplot as plt
groups = data_input.groupby('plot_by_this_var_group')

# One figure with a single axes; fig is the parent figure.
fig, ax = plt.subplots()
# Other layouts are shown as comments only: calling plt.subplots() again here
# would open extra empty figures and rebind fig away from the axes we draw on.
#   fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))  #== 2*2 plots
#   fig, [ax1, ax2, ax3, ax4] = plt.subplots(nrows=1, ncols=4)  #== 1 row * 4 columns plots

ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
# One marker-only series per group; the group key becomes the legend entry.
for name, group in groups:
    ax.plot(group.x_axis_name, group.y_axis_name, marker='.',
            linestyle='', ms=12, label=name)
ax.legend()
plt.show()
 
#===========================================================    
Method 2: use the seaborn package, a graphics library built on top of Matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# One colour per hue level. Every entry must be a valid matplotlib colour
# name; a misspelt name makes the call fail.
group_colours = ["red", "blue", "black", "orange", "purple"]

# fit_reg=False turns the regression plot into a plain scatter by group.
# legend=False suppresses seaborn's own legend so it can be placed below.
sns.lmplot(x="x_axis_name", y="y_axis_name", palette=group_colours,
           data=data_input, fit_reg=False, hue='plot_by_this_var_group',
           legend=False)
plt.legend(loc='lower right')
plt.show()
#===========================================================

Simple histogram plot to see the data distribution:
#===========================================================
# Normalised histogram (density=True) of train.category in 25 bins.
plt.figure(figsize=(6,4))
plt.hist(train.category, bins=25, density=True, alpha=0.6, color='g')
plt.title("The distribution of the train: categorical variable")
plt.show()
#===========================================================

We can also fit the one dimensional data via normal distribution:
#===========================================================
data=dataname
var='varname'
# numpy is needed for np.linspace below; import it here so the snippet runs
# on its own.
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
plt.figure(figsize=(10,8))
# Fit a normal distribution to the data:
mu, std = norm.fit(data[var])
# Plot the histogram.
plt.hist(data[var], bins=25, density=True, alpha=0.6, color='g')
# Plot the PDF over the x range the histogram ended up using.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'r', linewidth=2)
title = "var : Avg = %.2f,  std = %.2f" % (mu, std)
plt.title(title)

plt.show()
#===========================================================

Question: How do we plot candlestick for stock data in python?
#===========================================================
import random
# datetime (the class) is what provides .today(); importing only `date` left
# the name `datetime` undefined.
from datetime import datetime
from datetime import timedelta

# NOTE(review): pd, plt, test, Symbol and candlestick2_ohlc must already be
# defined. candlestick2_ohlc presumably comes from mpl_finance - confirm.

# Reference "today" for picking random windows; kept under the name `date`.
date=datetime.today()
pdffile=r'C:\Users\directory\test1.pdf'

# Parse the date column once instead of twice per loop iteration.
trade_dates = pd.to_datetime(test['date1'])

for x in range(10):
  # Random window: starts 30..4001 days ago and ends 59 days after its start.
  ran_num=random.randint(30,4001)
  start = date-timedelta(ran_num)
  end=start+timedelta(59)
  quotes = test[(trade_dates >= start) & (test['symbol']==Symbol)
  & (trade_dates <= end) ]

  fig, ax = plt.subplots()
  candlestick2_ohlc(ax,quotes['open'],quotes['high'],quotes['low'],
  quotes['close'],colorup='g', colordown='r',width=.8)
  # Save before showing so the file is written while the figure is still open.
  # Every iteration writes the same path, so only the last chart is kept.
  fig.savefig(pdffile, bbox_inches='tight')
  plt.show()
#===========================================================

What if we want to put multiple plots on separate pages of a single PDF file? We have to use the class backend_pdf.PdfPages:
#===========================================================
pdffile=r'C:\Users\...\Python\data\test1.pdf'
import random
from datetime import datetime, timedelta
import matplotlib.backends.backend_pdf

# NOTE(review): pd, plt, pred, Symbol and candlestick2_ohlc must already be
# defined. candlestick2_ohlc presumably comes from mpl_finance - confirm.


def _plot_indicator_page(frame, title):
    """Draw one 2*2 page of indicator charts for a single window and return it.

    frame: rows of one symbol/date window, read by position through its index
        (the callers reset the index first). Columns read: k, d, k_20, k_80,
        macd, macds, macdh, ref_0, k_d, open, high, low, close, close10,
        close22, close60.
    title: text placed above the top-left (K/D) panel.
    """
    fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(20, 16))
    pos = frame.index

    # Top-left: K and D lines with the k_20 / k_80 reference lines.
    ax1.plot(pos, frame.k, color='green', marker='d', linestyle='-')
    ax1.plot(pos, frame.d, color='red', marker='.', linestyle='dashed')
    ax1.plot(pos, frame.k_20, color='c', linestyle='--')
    ax1.plot(pos, frame.k_80, color='c', linestyle='--')
    ax1.set_ylabel('K D', color='g')
    # Axes.title is a Text object, not a method; set_title is the setter.
    ax1.set_title(title)

    # Top-right: MACD family with the ref_0 reference line.
    ax2.set_ylabel('MACD', color='b')
    ax2.plot(pos, frame.macd, color='green', marker='d', linestyle='-')
    ax2.plot(pos, frame.macds, color='red', marker='.', linestyle='dashed')
    ax2.plot(pos, frame.macdh, color='c', linestyle='--')
    ax2.plot(pos, frame.ref_0, color='c', linestyle='--')

    # Bottom-left: K-D difference with the ref_0 reference line.
    ax3.plot(pos, frame.k_d, color='black', linestyle='--')
    ax3.plot(pos, frame.ref_0, color='c', linestyle='--')
    ax3.set_ylabel('K_D diff', color='black')

    # Bottom-right: candlesticks overlaid with the close10/22/60 series.
    candlestick2_ohlc(ax4, frame['open'], frame['high'], frame['low'],
                      frame['close'], colorup='g', colordown='r', width=.8)
    ax4.plot(pos, frame.close10, color='r', marker='.', linestyle='-')
    ax4.plot(pos, frame.close22, color='k', marker='.', linestyle='-')
    ax4.plot(pos, frame.close60, color='b', marker='.', linestyle='-')
    return fig


today = datetime.today()
# Loop-invariant pieces of the row filter, computed once.
pred_dates = pd.to_datetime(pred['date1'])
is_symbol = pred['symbol'] == Symbol

# The context manager closes the PDF even if plotting raises part-way through.
with matplotlib.backends.backend_pdf.PdfPages(pdffile) as pdf:
    for x in range(2):
        # Random window start, 30..4001 days back. The "test" window covers
        # 59 days and the "validation" window the same start plus 69 days.
        ran_num = random.randint(30, 4001)
        day_start = today - timedelta(ran_num)
        day_end = day_start + timedelta(59)
        day_end1 = day_start + timedelta(69)

        quotes = pred[(pred_dates >= day_start)
                      & (pred_dates <= day_end) & is_symbol]
        quotes = quotes.reset_index(drop=True)
        quotes1 = pred[(pred_dates >= day_start)
                       & (pred_dates <= day_end1) & is_symbol]
        quotes1 = quotes1.reset_index(drop=True)

        # One separate figure per window, so each becomes its own PDF page.
        pages = (
            _plot_indicator_page(
                quotes, 'This is Random Test {num} '.format(num=x+1)),
            _plot_indicator_page(
                quotes1, 'This is Random Validation {num} '.format(num=x+1)),
        )
        for fig in pages:
            pdf.savefig(fig)
        plt.show()
        # Release the figures so they do not pile up across iterations.
        for fig in pages:
            plt.close(fig)
#===========================================================

How do we plot heat map by US state or counties in Python?
First, install the packages from the command line:
conda install -c anaconda plotly
conda install -c plotly plotly-geo
conda install -c plotly plotly-orca

#===========================================================
#here is an example plot by state in US:
import gzip,  pickle #use gzip to compress the file.
import _pickle as cPickle #C implementation of pickle, kept under its old name

import plotly.graph_objects as go

# Load data frame and tidy it.
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')
df.shape
df.head()

# Cache the frame locally as a gzip-compressed pickle.
file=r'/Users/*****/onedrive/data_dictionary/state_data'
with gzip.open(file, 'wb') as f_model_pkl:
    pickle.dump(df, f_model_pkl)

# Read it back. The with-block closes the gzip handle after loading.
# Only unpickle files you wrote yourself: pickle can execute code on load.
with gzip.open(file, 'rb') as f_model_pkl:
    test1 = cPickle.load(f_model_pkl, encoding="ASCII")
test1.head()

fig = go.Figure(data=go.Choropleth(
    locations=df['code'], # Spatial coordinates
    z = df['total exports'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Reds',
    colorbar_title = "Millions USD",
))

fig.update_layout(
    title_text = '2011 US Agriculture Exports by State',
    geo_scope='usa', # limit map scope to USA
)

fig.show(renderer="jpg")
#usually we just use fig.show(), sometimes nothing showing up.

#to see all different kind of renderers
import plotly.io as pio
pio.renderers
# The default renderer is the `default` attribute of pio.renderers; assigning
# to a new name `pio.renderer` changes nothing.
pio.renderers.default='notebook'
#===========================================================

Here is another example plotting graph by counties in US:
#===========================================================
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
#counties is the dict here gives the boundaries.

# Cache the boundaries locally as a gzip-compressed pickle.
# gzip, pickle and cPickle are expected to be imported already.
file=r'/Users/****/onedrive/data_dictionary/county_data'
with gzip.open(file, 'wb') as f_model_pkl:
    pickle.dump(counties, f_model_pkl)

# Read them back. The with-block closes the gzip handle after loading.
# Only unpickle files you wrote yourself: pickle can execute code on load.
with gzip.open(file, 'rb') as f_model_pkl:
    counties = cPickle.load(f_model_pkl, encoding="ASCII")

import pandas as pd
# fips is read as str rather than being parsed as a number.
df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
                   dtype={"fips": str})
df.head()
#df has two columns: fips, unemp
import plotly.express as px

fig = px.choropleth(df, geojson=counties, locations='fips', color='unemp',
                           color_continuous_scale="Viridis",
                           range_color=(0, 12),
                           scope="usa",
                           labels={'unemp':'unemployment rate'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show(renderer="jpg")
#===========================================================

Using the GCP Public Datasets for geospatial analytics: they include the us_counties datasets, which you can use to plot county-level data.
#===========================================================
# Three ways of pulling BigQuery data into a pandas dataframe are shown below:
#   1. download selected columns of a table,
#   2. download the result of a SQL query,
#   3. read a table directly through a BigQuery Storage read session.
# NOTE(review): bigquery_storage_v1beta1 is a beta API surface - confirm it is
# still shipped by the installed google-cloud-bigquery-storage version.
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage_v1beta1

# Explicitly create a credentials object. This allows you to use the same
# credentials for both the BigQuery and BigQuery Storage clients, avoiding
# unnecessary API calls to fetch duplicate authentication tokens.
credentials, your_project_id = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# Make clients.
bqclient = bigquery.Client(
    credentials=credentials,
    project=your_project_id,
)
bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
    credentials=credentials
)
# Download a table.
table = bigquery.TableReference.from_string(
    "bigquery-public-data.utility_us.country_code_iso"
)
# Only the two listed columns are requested.
rows = bqclient.list_rows(
    table,
    selected_fields=[
        bigquery.SchemaField("country_name", "STRING"),
        bigquery.SchemaField("fips_code", "STRING"),
    ],
)
# The storage client is passed in so the download can go through it.
dataframe = rows.to_dataframe(bqstorage_client=bqstorageclient)
print(dataframe.head())
# Download query results.
query_string = """
SELECT
CONCAT(
    'https://stackoverflow.com/questions/',
    CAST(id as STRING)) as url,
view_count
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE tags like '%google-bigquery%'
ORDER BY view_count DESC
"""

# Run the query, wait for it to finish, then convert the rows to a dataframe.
dataframe = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
)
print(dataframe.head())
# Read a table directly with the BigQuery Storage API.
# `table` is rebound here to a storage-API table reference.
table = bigquery_storage_v1beta1.types.TableReference()
table.project_id = "bigquery-public-data"
table.dataset_id = "new_york_trees"
table.table_id = "tree_species"

# Select columns to read with read options. If no read options are
# specified, the whole table is read.
read_options = bigquery_storage_v1beta1.types.TableReadOptions()
read_options.selected_fields.append("species_common_name")
read_options.selected_fields.append("fall_color")

# The read session is created under the caller's own project.
parent = "projects/{}".format(your_project_id)
session = bqstorageclient.create_read_session(
    table,
    parent,
    read_options=read_options,
    # This API can also deliver data serialized in Apache Avro format.
    # This example leverages Apache Arrow.
    format_=bigquery_storage_v1beta1.enums.DataFormat.ARROW,
    # We use a LIQUID strategy in this example because we only read from a
    # single stream. Consider BALANCED if you're consuming multiple streams
    # concurrently and want more consistent stream sizes.
    sharding_strategy=(
        bigquery_storage_v1beta1.enums.ShardingStrategy.LIQUID
    ),
)

# This example reads from only a single stream. Read from multiple streams
# to fetch data faster. Note that the session may not contain any streams
# if there are no rows to read.
# NOTE(review): session.streams[0] raises IndexError when the session has no
# streams (see the note above) - guard if the table may be empty.
stream = session.streams[0]
position = bigquery_storage_v1beta1.types.StreamPosition(stream=stream)
reader = bqstorageclient.read_rows(position)

# Parse all row blocks (Arrow format was requested above) and create a
# dataframe. This call requires a session, because the session contains the
# schema for the row blocks.
dataframe = reader.to_dataframe(session)
print(dataframe.head())
#===========================================================


No comments:

Post a Comment

GCP Study notes 13: Architecting with Google Kubernetes Engine: Foundations (courseRA notes)

Architecting with Google Compute Engine Specialization : 4 Courses in this Specialization. 1. Google Cloud Platform Fundamentals: Core In...