## Tuesday, March 24, 2020

### Python study notes 10: Plot all kinds of graphs in Python

How do we plot a simple histogram?
How do we plot with a 2nd y-axis?
How do we draw a scatter plot by group in Python?
How do we plot a heat map by US state or county in Python?

Question: How do we plot a simple histogram, and how do we plot with a 2nd y-axis?
``````#===========================================================
# Simple line plot with a legend.
# Every line carries an explicit label so that plt.legend() has entries to
# show regardless of the matplotlib version in use.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
plt.plot(data1.gap_250_, data1.ret_mon_12_avg12, '*-', color='green', label='ret_mon_12_avg12')
plt.plot(data1.gap_250_, data1.ret_mon_24_avg24, 'v-', color='green', label='ret_mon_24_avg24')
plt.plot(data1.gap_250_, data1.ret_mon_36_avg36, '^-', color='red', label='ret_mon_36_avg36')
plt.plot(data1.gap_250_, data1.ret_mon_48_avg48, 's-', color='yellow', label='ret_mon_48_avg48')
plt.plot(data1.gap_250_, data1.ret_mon_60_avg60, 'o-', color='blue', label='ret_mon_60_avg60')

legend = plt.legend(loc='upper center', shadow=True, fontsize='x-large')
# Put a nicer background color on the legend.
legend.get_frame().set_facecolor('C0')

plt.show()
#==============

# Histogram of a single Series on its own figure.
plt.figure(figsize=(10, 7))
series1.hist()
plt.show()

# Alternatively, draw one DataFrame row as a bar chart (kind='bar').
plt.figure(figsize=(10, 7))
df.iloc[5].plot(kind='bar');  # trailing ';' hides the echoed return value in a notebook

# Kernel density estimate (KDE) for a smooth density plot.
ser = pd.Series(np.random.randn(1000))
ser.plot(kind='kde')

# Two histograms side by side on one figure.
f4, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
f4.tight_layout()
for target_axis, series, caption in ((ax1, series1, 'title1'), (ax2, series2, 'title2')):
    series.hist(ax=target_axis)
    target_axis.set_title(caption)
plt.show()
#===========================================================``````

Plot with a 2nd y-axis (two y-axes sharing the same x-axis):
``````#===========================================================
# Two series drawn against the same x values, each with its own y-axis.
import matplotlib.pyplot as plt

figure, ax1 = plt.subplots(figsize=(20, 10))
ax2 = ax1.twinx()  # second axes object that shares ax1's x-axis
ax1.plot(df3.row1, df3.close_std, color='g', linestyle='-')
ax2.plot(df3.row1, df3.close_change, color='b', linestyle='-')
#===========================================================``````

Question: How do we scatter plot by group in python?
``````#===========================================================
## Simply try the plot overlay first: several y columns against one x column.
## var_x and var_y1..var_y4 are placeholder column names. Each call must use a
## different y column, otherwise the later markers just cover the earlier ones.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
plt.scatter(data1.var_x, data1.var_y1, color='green', marker='.')
plt.scatter(data1.var_x, data1.var_y2, color='red', marker='d')
plt.scatter(data1.var_x, data1.var_y3, color='blue', marker='o')
plt.scatter(data1.var_x, data1.var_y4, color='k', marker='*')
plt.show()
#===========================================================
# Method 1: use matplotlib to scatter plot by group
import matplotlib.pyplot as plt

groups = data_input.groupby('plot_by_this_var_group')
fig, ax = plt.subplots()  # fig is the parent figure, ax its single plotting area

# Other layouts -- use ONE of these instead of the call above. Running them
# in addition would open extra, empty figures and rebind `fig`:
#   fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))  # 2*2 plots
#   fig, [ax1, ax2, ax3, ax4] = plt.subplots(nrows=1, ncols=4)  # 1 row * 4 columns plots

ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    # One marker-only series per group; the group key becomes the legend entry.
    ax.plot(group.x_axis_name, group.y_axis_name, marker='.',
            linestyle='', ms=12, label=name)
ax.legend()
plt.show()

#===========================================================
# Method 2: use seaborn package: a graphic library built on top of Matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# fit_reg=False turns off the regression line so only the points are drawn;
# hue colours the points by group. legend=False suppresses seaborn's own
# legend so that plt.legend() below can place it explicitly.
sns.lmplot(
    x="x_axis_name",
    y="y_axis_name",
    palette=["red", "blue", "black", "orange", "purple"],
    data=data_input,
    fit_reg=False,
    hue='plot_by_this_var_group',
    legend=False,
)
plt.legend(loc='lower right')
plt.show()
#===========================================================``````

Simple histogram plot to see the data distribution:
``````#===========================================================
# Density-normalised histogram (25 bins) of train.category.
hist_options = dict(bins=25, density=True, alpha=0.6, color='g')
plt.figure(figsize=(6, 4))
plt.hist(train.category, **hist_options)
plt.title("The distribution of the train: categorial variable")
plt.show()
#===========================================================``````

We can also fit a normal distribution to one-dimensional data and overlay it on the histogram:
``````#===========================================================
# Fit a normal distribution to one column and overlay its PDF on the histogram.
data = dataname   # placeholder: the data set to analyse
var = 'varname'   # placeholder: name of the column to fit
import numpy as np  # needed for np.linspace below
from scipy.stats import norm
import matplotlib.pyplot as plt

# Missing values would make the fitted parameters NaN (or raise), so drop them.
# NOTE(review): assumes data[var] is a pandas Series -- confirm for your input.
values = data[var].dropna()

plt.figure(figsize=(10, 8))
# Fit a normal distribution to the data:
mu, std = norm.fit(values)
# Plot the histogram (density=True so it is on the same scale as the PDF).
plt.hist(values, bins=25, density=True, alpha=0.6, color='g')
# Plot the PDF across the x range the histogram ended up using.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'r', linewidth=2)
title = "var : Avg = %.2f,  std = %.2f" % (mu, std)
plt.title(title)

plt.show()
#===========================================================``````

Question: How do we plot candlestick for stock data in python?
``````#===========================================================
# Draw candlestick charts for ten randomly chosen 60-day windows of one symbol.
import random
from datetime import datetime, timedelta

# NOTE(review): candlestick2_ohlc, pd, plt, test and Symbol are not defined in
# this snippet and must already be in scope. candlestick2_ohlc presumably comes
# from mpl_finance (or mplfinance.original_flavor) -- confirm which is installed.

# `date` holds today's timestamp; the name is kept because the following
# snippet reuses it.
date = datetime.today()
pdffile = r'C:\Users\directory\test1.pdf'

# Parse the date column once instead of on every loop iteration.
trade_dates = pd.to_datetime(test['date1'])

for x in range(10):
    # Window start: a random number of days (30..4001) before today.
    ran_num = random.randint(30, 4001)
    start = date - timedelta(ran_num)
    end = start + timedelta(59)
    quotes = test[(trade_dates >= start) & (test['symbol'] == Symbol)
                  & (trade_dates <= end)]

    fig, ax = plt.subplots()
    candlestick2_ohlc(ax, quotes['open'], quotes['high'], quotes['low'],
                      quotes['close'], colorup='g', colordown='r', width=.8)
    # Save before showing. Every iteration writes to the same file, so only
    # the last chart survives; use matplotlib's PdfPages to keep all of them.
    fig.savefig(pdffile, bbox_inches='tight')
    plt.show()
#===========================================================``````

What if we want to put multiple plots on separate pages of a single PDF file? For that we use the `PdfPages` class from `matplotlib.backends.backend_pdf`:
``````#===========================================================
# Write several figures into one multi-page PDF with PdfPages.
# NOTE(review): random, timedelta, date, pd, plt, pred, Symbol and
# candlestick2_ohlc must already be in scope (e.g. from the previous snippet).
pdffile = r'C:\Users\...\Python\data\test1.pdf'
import matplotlib.backends.backend_pdf


def _plot_indicator_panels(ax_kd, ax_macd, ax_price, data, title):
    """Draw the K/D, MACD and candlestick panels for one window of quotes."""
    ax_kd.plot(data.index, data.k, color='green', marker='d', linestyle='-')
    ax_kd.plot(data.index, data.d, color='red', marker='.', linestyle='dashed')
    ax_kd.plot(data.index, data.k_20, color='c', linestyle='--')
    ax_kd.plot(data.index, data.k_80, color='c', linestyle='--')
    ax_kd.set_ylabel('K D', color='g')
    # Axes.title is a Text object, not a method; set_title() sets the title.
    ax_kd.set_title(title)

    ax_macd.set_ylabel('MACD', color='b')
    ax_macd.plot(data.index, data.macd, color='green', marker='d', linestyle='-')
    ax_macd.plot(data.index, data.macds, color='red', marker='.', linestyle='dashed')
    ax_macd.plot(data.index, data.macdh, color='c', linestyle='--')
    ax_macd.plot(data.index, data.ref_0, color='c', linestyle='--')

    candlestick2_ohlc(ax_price, data['open'], data['high'], data['low'],
                      data['close'], colorup='g', colordown='r', width=.8)
    ax_price.plot(data.index, data.close10, color='r', marker='.', linestyle='-')
    ax_price.plot(data.index, data.close22, color='k', marker='.', linestyle='-')
    ax_price.plot(data.index, data.close60, color='b', marker='.', linestyle='-')


# Loop-invariant pieces of the row filter, computed once.
pred_dates = pd.to_datetime(pred['date1'])
is_symbol = pred['symbol'] == Symbol

# The context manager finalises and closes the PDF even if plotting fails.
with matplotlib.backends.backend_pdf.PdfPages(pdffile) as pdf:
    for x in range(2):
        ran_num = random.randint(30, 4001)
        day_start = date - timedelta(ran_num)
        day_end = day_start + timedelta(59)
        day_end1 = day_start + timedelta(69)

        quotes = pred[(pred_dates >= day_start) & (pred_dates <= day_end) & is_symbol]
        quotes = quotes.reset_index(drop=True)
        # Do not write quotes = quotes.reset_index(inplace=True):
        # with inplace=True the call returns None.
        quotes1 = pred[(pred_dates >= day_start) & (pred_dates <= day_end1) & is_symbol]
        quotes1 = quotes1.reset_index(drop=True)

        # Page 1: the 60-day window, on a 2*2 grid (the lower-left panel stays empty).
        fig1, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(20, 16))
        _plot_indicator_panels(ax1, ax2, ax4, quotes,
                               'This is Randome Test {num} '.format(num=x+1))
        pdf.savefig(fig1)  # save before show so the page never ends up blank
        plt.show()

        # Page 2: the 70-day window gets its own figure. Reusing the axes of
        # fig1 would draw on top of page 1 and leave fig3 undefined.
        fig3, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(20, 16))
        _plot_indicator_panels(ax1, ax2, ax4, quotes1,
                               'This is Randome Validataion {num} '.format(num=x+1))
        ax3.plot(quotes1.index, quotes1.k_d, color='black', linestyle='--')
        ax3.plot(quotes1.index, quotes1.ref_0, color='c', linestyle='--')
        ax3.set_ylabel('K_D diff', color='black')
        pdf.savefig(fig3)
        plt.show()
#===========================================================``````

How do we plot a heat map by US state or county in Python?
First, install the required packages from the command line:
conda install -c anaconda plotly
conda install -c plotly plotly-geo
conda install -c plotly plotly-orca

``````#===========================================================
# Here is an example plot by state in the US.
import gzip  # use gzip to compress the pickled file
import pickle
import _pickle as cPickle  # C implementation of pickle

import plotly.graph_objects as go

# Load data frame and tidy it.
import pandas as pd
# Plotly's sample data set for this example; it provides the 'code' and
# 'total exports' columns used below.
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')
print(df.shape)

# Optional: keep a compressed local copy so the data can be reloaded offline.
file = r'/Users/*****/onedrive/data_dictionary/state_data'
with gzip.open(file, 'wb') as f_model_pkl:
    pickle.dump(df, f_model_pkl)

# Only unpickle files you created yourself: pickle can execute arbitrary code.
with gzip.open(file, 'rb') as f_model_pkl:
    test1 = cPickle.load(f_model_pkl, encoding="ASCII")

fig = go.Figure(data=go.Choropleth(
    locations=df['code'],  # Spatial coordinates
    z=df['total exports'].astype(float),  # Data to be color-coded
    locationmode='USA-states',  # set of locations match entries in `locations`
    colorscale='Reds',
    colorbar_title="Millions USD",
))

fig.update_layout(
    title_text='2011 US Agriculture Exports by State',
    geo_scope='usa',  # limit map scope to USA
)

fig.show(renderer="jpg")
# Usually we just use fig.show(); if nothing shows up, pick a renderer explicitly.

# To see all available renderers and choose the default one:
import plotly.io as pio
print(pio.renderers)
# The default is set through pio.renderers.default; assigning to
# pio.renderer would only create an unused attribute.
pio.renderers.default = 'notebook'
#===========================================================``````

Here is another example plotting graph by counties in US:
``````#===========================================================
import gzip
import json
import pickle
from urllib.request import urlopen

with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    # counties is the GeoJSON dict that gives the county boundaries.
    counties = json.load(response)

# Optional: keep a compressed local copy of the boundaries for offline use.
file = r'/Users/****/onedrive/data_dictionary/county_data'
with gzip.open(file, 'wb') as f_model_pkl:
    pickle.dump(counties, f_model_pkl)

# Only unpickle files you created yourself: pickle can execute arbitrary code.
with gzip.open(file, 'rb') as f_model_pkl:
    counties = pickle.load(f_model_pkl, encoding="ASCII")

import pandas as pd
# df has two columns: fips, unemp. fips is read as a string so that leading
# zeros are kept and the codes match the ids in the GeoJSON.
df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
                 dtype={"fips": str})
import plotly.express as px

fig = px.choropleth(df, geojson=counties, locations='fips', color='unemp',
                    color_continuous_scale="Viridis",
                    range_color=(0, 12),
                    scope="usa",
                    labels={'unemp': 'unemployment rate'}
                    )
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show(renderer="jpg")
#===========================================================``````

We can also use the GCP public datasets for geospatial analytics; they include a us_counties dataset that you can use to plot county-level data.
``````#===========================================================
from google.cloud import bigquery
from google.cloud import bigquery_storage_v1beta1

# Explicitly create a credentials object. This allows you to use the same
# credentials for both the BigQuery and BigQuery Storage clients, avoiding
# unnecessary API calls to fetch duplicate authentication tokens.
credentials, your_project_id = google.auth.default(
)

# Make clients.
bqclient = bigquery.Client(
credentials=credentials,
project=your_project_id,
)
bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
credentials=credentials
)
table = bigquery.TableReference.from_string(
"bigquery-public-data.utility_us.country_code_iso"
)
rows = bqclient.list_rows(
table,
selected_fields=[
bigquery.SchemaField("country_name", "STRING"),
bigquery.SchemaField("fips_code", "STRING"),
],
)
dataframe = rows.to_dataframe(bqstorage_client=bqstorageclient)
query_string = """
SELECT
CONCAT(
'https://stackoverflow.com/questions/',
CAST(id as STRING)) as url,
view_count
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE tags like '%google-bigquery%'
ORDER BY view_count DESC
"""

dataframe = (
bqclient.query(query_string)
.result()
.to_dataframe(bqstorage_client=bqstorageclient)
)
table = bigquery_storage_v1beta1.types.TableReference()
table.project_id = "bigquery-public-data"
table.dataset_id = "new_york_trees"
table.table_id = "tree_species"

# Select columns to read with read options. If no read options are
# specified, the whole table is read.

parent = "projects/{}".format(your_project_id)
table,
parent,
# This API can also deliver data serialized in Apache Avro format.
# This example leverages Apache Arrow.
format_=bigquery_storage_v1beta1.enums.DataFormat.ARROW,
# We use a LIQUID strategy in this example because we only read from a
# single stream. Consider BALANCED if you're consuming multiple streams
# concurrently and want more consistent stream sizes.
sharding_strategy=(
bigquery_storage_v1beta1.enums.ShardingStrategy.LIQUID
),
)

# This example reads from only a single stream. Read from multiple streams
# to fetch data faster. Note that the session may not contain any streams
# if there are no rows to read.
stream = session.streams[0]
position = bigquery_storage_v1beta1.types.StreamPosition(stream=stream)

# Parse all Avro blocks and create a dataframe. This call requires a
# session, because the session contains the schema for the row blocks.