#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
#===========================================================================================================
# 2006-2021 Paseman & Associates (www.paseman.com). Use as you see fit, but no guarantees
#===========================================================================================================

Full Index list here:
https://norgatedata.com/data-content-tables.php#ushics
####NORGATE RUNS ON WINDOWS ONLY#####
pip install norgatedata --upgrade
We install Norgate on Mac to make the routines cross platform.

watchlists=['Dow Jones Industrial Average Current & Past', 'Dow Jones Industrial Average',
            'NASDAQ 100 Current & Past', 'NASDAQ 100',
            'Nasdaq Biotechnology Current & Past', 'Nasdaq Biotechnology',
            'Nasdaq Next Generation 100 Current & Past', 'Nasdaq Next Generation 100',
            'Nasdaq Q-50 Current & Past', 'Nasdaq Q-50',
            'Russell 1000 Current & Past', 'Russell 1000',
            'Russell 2000 Current & Past', 'Russell 2000',
            'Russell 3000 Current & Past', 'Russell 3000',
            'Russell Micro Cap Current & Past', 'Russell Micro Cap',
            'Russell Mid Cap Current & Past', 'Russell Mid Cap',
            'S&P 100 Current & Past', 'S&P 100',
            'S&P 500 Current & Past', 'S&P 500', 
            'S&P 500 Dividend Aristocrats Current & Past', 'S&P 500 Dividend Aristocrats',
            'S&P Composite 1500 Current & Past', 'S&P Composite 1500',
            'S&P MidCap 400 Current & Past', 'S&P MidCap 400',
            'S&P SmallCap 600 Current & Past', 'S&P SmallCap 600']

    print(df["BMRN"].loc['2020-12-03':])
    changeList=df[df["BMRN"] != 0]["BMRN"]
    print(changeList,pd.to_datetime(str(changeList.index.values[0])).strftime("%m/%d/%Y"),changeList.values)

Directory Structure at end of date/time path e.g. 20210326_1300
*.csv                  <- time (x) vs. symbol (y) incidence matrices from Norgate for each index
misc/companyInfo.csv   <- companyInfo file from Norgate (two columns: symbol, name) used to make Don files
misc/latestSymbols.csv <- symbol (x) vs. index (y) incidence matrix (single file)
Don/*.csv              <- ticker (x) vs. time (y) incidence matrices in Don Format for each index
changelist/*.csv       <- Date, AddSymbol, DeleteSymbol tables for each index
"""

#https://pypi.org/project/norgatedata/
import norgatedata as ngd
import pandas as pd
import numpy as np
import os
import glob
def flatten(l):
  """Concatenate the items of every sub-iterable of l into one flat list (one level deep)."""
  flat = []
  for sublist in l:
    flat.extend(sublist)
  return flat

#===========================================================================================================
def getIndexConstituent(ticker,indexname,priceadjust,padding_setting,timeseriesformat):
  """Return the "Index Constituent" column for one ticker, renamed to the ticker symbol.

  ticker           -- symbol passed to both Norgate calls
  indexname        -- index name passed to ngd.index_constituent_timeseries
  priceadjust      -- forwarded as stock_price_adjustment_setting
  padding_setting  -- forwarded to both Norgate calls
  timeseriesformat -- forwarded to both Norgate calls

  Returns the column selected by `ticker` from the frame Norgate hands back.
  """
  # Options that both Norgate calls receive unchanged.
  shared = dict(padding_setting=padding_setting, timeseriesformat=timeseriesformat)

  # Step 1: price history for the ticker; it is the frame the next call extends.
  prices = ngd.price_timeseries(ticker, stock_price_adjustment_setting=priceadjust, **shared)

  # Step 2: have Norgate add the "Index Constituent" column to that frame.
  membership = ngd.index_constituent_timeseries(
      ticker, indexname, limit=-1, pandas_dataframe=prices, **shared)

  # Step 3: label the column with the ticker so per-ticker results can be concatenated side by side.
  membership.rename(columns={"Index Constituent": ticker}, inplace=True)
  return membership[ticker]

def genNorgateMatrix(path):
  """Write one index-membership matrix per "Current & Past" Norgate watchlist to <path>/<indexname>.csv.

  path -- output directory; created if missing. A log of the watchlists, their symbols and
          any per-index failure is written to <path>/log.txt.

  For every watchlist whose name contains " Current & Past", the index name is taken to be the
  text before that suffix, getIndexConstituent() is called for each symbol of the watchlist, and
  the resulting columns are concatenated side by side into one CSV.
  A failure while building one index is reported (stdout and log) and the remaining indexes are
  still processed.
  """
  print("\nRunning genNorgateMatrix")
  os.makedirs(path, exist_ok=True)
  with open(path+'/log.txt', 'w') as logF:
    # https://pypi.org/project/norgatedata/
    priceadjust = ngd.StockPriceAdjustmentType.TOTALRETURN
    padding_setting = ngd.PaddingType.NONE
    timeseriesformat = 'pandas-dataframe'

    # index_constituent_timeseries takes an index, not a watch list, so the index name is
    # derived from the watchlist name (original author's open question: is this the right way?)
    currentAndPast = " Current & Past"

    allwatchlistnames = ngd.watchlists()
    logF.write("allwatchlistnames:"+str(allwatchlistnames)+"\n\n")
    # see Illustration 1 here: https://norgatedata.com/ndu-watchlist-library.php
    currentAndPastWatchlistNames = [w for w in allwatchlistnames if currentAndPast in w]
    logF.write(str(currentAndPastWatchlistNames))

    # Pass 1: log every watchlist with its symbols, remembering them so Norgate is asked only once.
    indexSymbols = []
    for watchlistname in currentAndPastWatchlistNames:
      indexname = watchlistname[:watchlistname.index(currentAndPast)]
      symbols = ngd.watchlist_symbols(watchlistname)
      logF.write("\n\n%s %d >%s<\n%s\n"%(watchlistname,len(symbols),indexname,str(symbols)))
      indexSymbols.append((indexname,symbols))

    # Pass 2: build and save the matrix for each index.
    for indexname,symbols in indexSymbols:
      logF.write("\n\n>"+indexname+"<\n")
      try:
        dfs=[getIndexConstituent(symbol,indexname.replace("Nasdaq","NASDAQ"), #20210324 Workaround Norgate Bug#1
                                 priceadjust,padding_setting,timeseriesformat) for symbol in symbols]
        MatrixDf=pd.concat(dfs,axis=1)
        MatrixDf.to_csv(path+"/"+indexname+".csv")
      except Exception as exc:
        # Best effort: keep going with the other indexes, but record why this one failed.
        # Catching Exception (not a bare except) lets Ctrl-C / SystemExit stop the run.
        print("oops: Failed with: ",indexname,repr(exc))
        logF.write("FAILED %s: %r\n"%(indexname,exc))
        
#===========================================================================================================
def genCorporateInfo(path):
  """Write <path>/misc/companyInfo.csv with columns "symbol" and "name".

  The symbols are the sorted union of the symbols of every Norgate watchlist whose name
  contains " Current & Past"; each name is whatever ngd.security_name returns for the symbol.
  The misc directory is created when it is not accessible.
  """
  print("\nRunning genCorporateInfo")
  suffix = " Current & Past"
  historicalLists = [name for name in ngd.watchlists() if suffix in name]

  # Collect each symbol once, no matter how many watchlists contain it.
  uniqueSymbols = set()
  for listName in historicalLists:
    uniqueSymbols |= set(ngd.watchlist_symbols(listName))
  symbols = sorted(uniqueSymbols)
  names = list(map(ngd.security_name, symbols))

  miscDir = path+"/misc"
  if not os.access(miscDir, os.R_OK):
    os.makedirs(miscDir)
  table = pd.DataFrame(data={"symbol":symbols,"name":names})
  table.to_csv(miscDir+"/companyInfo.csv")
#===========================================================================================================
# Routines above this point use Norgate to create intermediate files required for the routines below.
# No reliance on NorgateData below this point
#===========================================================================================================
def incidenceMatrixToChangeList(inputpath,printNoChange=True,filenames=None):
  """Convert incidence-matrix CSVs into change lists (Date, Add, Delete).

  inputpath     -- directory holding the incidence matrices; results go to <inputpath>/changelist
  printNoChange -- when True, print "<ticker>\\t-\\t-" for tickers that never change
  filenames     -- explicit list of CSV files to convert; defaults to <inputpath>/*.csv

  Each input CSV has dates as its first column and one 0/1 column per ticker. For every file a
  CSV with the same base name is written to <inputpath>/changelist with one row per change:
  Date is YYYYMMDD, Add holds the ticker when it went 0 -> 1, Delete holds it when it went
  1 -> 0. Any other step value is reported with the ticker in Add and "ERROR" in Delete.
  The first row of each input only serves as the baseline, so it never produces a change.
  """
  print("\nRunning incidenceMatrixToChangeList")
  if filenames is None: filenames=glob.glob(inputpath+"/*.csv")
  outputpath=inputpath+"/changelist"
  os.makedirs(outputpath, exist_ok=True)
  for filename in filenames:
    print(filename)
    # fillna(0) (hopefully) fixes Norgate bug#2 by turning all NaNs to 0.0
    df0 = pd.read_csv(filename,  header=0, index_col=0, parse_dates=True).fillna(0)
    df=df0-df0.shift(1) # Now, 1.0 => add; -1.0 => delete; 0.0 => No change
    df=df[1:]           # Remove first row (it has no predecessor to diff against)
    lines=[]
    for ticker in df.columns:
      column=df[ticker]
      changeList=column[column != 0]  # Strip out all "no change" rows and leave the add/deletes behind
      if len(changeList) == 0:
        if printNoChange: print("%s\t-\t-"%ticker)
        continue
      for date,value in zip(changeList.index.values,changeList.values):
        date=str(date)
        date=date[0:4]+date[5:7]+date[8:10]  #Convert to YYYYMMDD format
        # E.g. CRM appeared on 8/31/2020; XOM disappeared on 8/31/2020
        # Date        Add     Delete
        # 20200831	CRM	-
        # 20200831	-	XOM
        if value==1.0: line = (date,ticker,"")
        elif value==-1.0: line = (date,"",ticker)
        else: line = (date,ticker,"ERROR")
        lines.append(line)
    CLdf=pd.DataFrame(data=lines,columns=["Date","Add","Delete"]).sort_values(by=['Date', 'Add', 'Delete'])
    # Build the target from the base name: a textual replace of inputpath inside filename would
    # leave the name untouched (and overwrite the source file) for files living elsewhere.
    CLdf.to_csv(os.path.join(outputpath,os.path.basename(filename)))
    #print(CLdf.sort_values(by=['Date', 'Add', 'Delete']))
#===========================================================================================================
def xlat(dictionary,key):
  """Look key up in dictionary; on a miss, print the key and return the "***ERROR***" marker."""
  if key not in dictionary:
    print("error: ",key)
    return "***ERROR***"
  return dictionary[key]

def incidenceMatrixToDonFormat(inputpath):
  """Rewrite every <inputpath>/*.csv incidence matrix in "Don format" under <inputpath>/Don.

  inputpath -- directory holding the incidence matrices and misc/companyInfo.csv
               (columns "symbol" and "name"), which supplies the company names.

  For each file the matrix is reduced to the last row of every business month, 0 becomes " ",
  1 becomes "X", and the result is transposed so that rows are tickers and columns are
  month-end dates formatted MM/DD/YYYY. A "Name" column is inserted first; tickers missing
  from companyInfo.csv get the placeholder returned by xlat().
  """
  print("\nRunning incidenceMatrixToDonFormat")
  filenames=glob.glob(inputpath+"/*.csv")
  outputpath=inputpath+"/Don"
  os.makedirs(outputpath, exist_ok=True)
  CIdf = pd.read_csv(inputpath+"/misc/companyInfo.csv")
  x = dict(zip(CIdf["symbol"].values,CIdf["name"].values))
  for filename in filenames:
    print(filename)
    # fillna(0) (hopefully) fixes Norgate bug#2 by turning all NaNs to 0.0
    df = pd.read_csv(filename,  header=0, index_col=0, parse_dates=True).fillna(0)
    # Business-month-end sampling. The offset object is used instead of the 'BM' alias because
    # the alias is deprecated in pandas 2.2 while its replacement 'BME' is unknown to older versions.
    df = df.resample(pd.offsets.BMonthEnd()).last()
    df = df.replace(0.0," ").replace(0," ").replace(1.0,'X').replace(1,'X').transpose()
    df.columns=[column.strftime("%m/%d/%Y") for column in df.columns]  # Put in timeformat that Don likes.
    df.insert(0, 'Name', [xlat(x,symbol) for symbol in df.index.values]) # insert company names associated with ticker symbols in first column
    # Join on the base name rather than textually replacing inputpath inside filename, which
    # would also rewrite any later occurrence of the same text in the file name.
    df.to_csv(os.path.join(outputpath,os.path.basename(filename)))
#===========================================================================================================
def makeLatestSymbolsMatrix(path,filenames=None):
  """Build the ticker-by-index matrix of current members and save it as <path>/misc/latestSymbols.csv.

  path      -- run directory; the result is written to <path>/misc/latestSymbols.csv
  filenames -- explicit list of incidence-matrix CSVs; defaults to <path>/*.csv

  For each file the last row decides membership (non-zero => member) and the file's base name
  without extension is used as the index name. Returns the DataFrame that was written: rows are
  the sorted union of all member tickers, columns are index names, values are 1/0.
  """
  print("\nRunning makeLatestSymbolsMatrix")
  if filenames is None: filenames=glob.glob(path+"/*.csv")
  d={} # Dictionary of symbol lists indexed by stock index name
  for filename in filenames:
    # fillna(0) (hopefully) fixes Norgate bug#2 by turning all NaNs to 0.0
    lastRow = pd.read_csv(filename,  header=0, index_col=0, parse_dates=True).fillna(0).iloc[-1]
    # Take the name from the file itself; slicing by len(path) breaks for files outside path,
    # for a path with a trailing separator, and mixes up / and \ handling.
    indexname = os.path.splitext(os.path.basename(filename))[0]
    d[indexname]=list(np.compress(lastRow.values,lastRow.index.values)) # Convert incidence matrix to list of ticker names
  tickers = sorted(set().union(*d.values()))
  members = {key:set(values) for key,values in d.items()}  # sets give O(1) membership tests
  # Incidence matrix of tickers(rows) vs index name(columns)
  df=pd.DataFrame({key:[ticker in members[key] for ticker in tickers] for key in d},index=tickers).astype(int)
  path=path+"/misc"
  os.makedirs(path, exist_ok=True)
  df.to_csv(path+"/latestSymbols.csv")
  return df

def getLatestYahooSymbols(path,idx):
  """Return the tickers flagged for index column idx in <path>/misc/latestSymbols.csv, Yahoo style.

  path -- run directory containing misc/latestSymbols.csv
  idx  -- column name (index name) to select

  Tickers keep the row order of the CSV; every "." is replaced by "-"
  (Norgate format BRK.B becomes Yahoo format BRK-B).
  """
  membership = pd.read_csv(path+"/misc/latestSymbols.csv",  header=0, index_col=0)
  membership = membership.replace(0,False).replace(1,True)
  # Keep only the row labels whose flag in the requested column is truthy.
  selected = np.compress(membership[idx].values,membership.index)
  return [symbol.replace('.','-') for symbol in list(selected)]
  
#===========================================================================================================

# Script entry point: runs the whole pipeline against one hard-coded run directory.
# The order matters: the first two steps need Norgate and create the files the later steps read.
if __name__ == '__main__':
  path="20210327_1500"  # run directory, named as date_time (see module docstring)
  # Norgate-dependent steps: per-index incidence matrices (<path>/*.csv), then misc/companyInfo.csv.
  genNorgateMatrix(path)
  genCorporateInfo(path)
  # Norgate-free steps. incidenceMatrixToDonFormat reads misc/companyInfo.csv, so it must come
  # after genCorporateInfo. Each step globs <path>/*.csv, which does not descend into the
  # misc/, Don/ and changelist/ sub-directories.
  incidenceMatrixToDonFormat(path)
  incidenceMatrixToChangeList(path,printNoChange=False)#,filenames=["old/20210323_1230/NASDAQ 100.csv"])
  df0=makeLatestSymbolsMatrix(path)#,filenames=["old/20210323_1230/NASDAQ 100.csv"])
  # Ad-hoc inspection helpers, left disabled:
  #print(df0)
  #print(list(df0.columns).index("NASDAQ 100"))
  #tickers = getLatestYahooSymbols("20210325_1900","NASDAQ 100")
  #print(len(tickers),tickers)