#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
#===========================================================================================================
# 2006-2021 Paseman & Associates (www.paseman.com). Use as you see fit, but no guarantees
#===========================================================================================================
Full Index list here: https://norgatedata.com/data-content-tables.php#ushics

####NORGATE RUNS ON WINDOWS ONLY#####
pip install norgatedata --upgrade
We install Norgate on Mac to make the routines cross platform.

watchlists=['Dow Jones Industrial Average Current & Past', 'Dow Jones Industrial Average',
            'NASDAQ 100 Current & Past', 'NASDAQ 100',
            'Nasdaq Biotechnology Current & Past', 'Nasdaq Biotechnology',
            'Nasdaq Next Generation 100 Current & Past', 'Nasdaq Next Generation 100',
            'Nasdaq Q-50 Current & Past', 'Nasdaq Q-50',
            'Russell 1000 Current & Past', 'Russell 1000',
            'Russell 2000 Current & Past', 'Russell 2000',
            'Russell 3000 Current & Past', 'Russell 3000',
            'Russell Micro Cap Current & Past', 'Russell Micro Cap',
            'Russell Mid Cap Current & Past', 'Russell Mid Cap',
            'S&P 100 Current & Past', 'S&P 100',
            'S&P 500 Current & Past', 'S&P 500',
            'S&P 500 Dividend Aristocrats Current & Past', 'S&P 500 Dividend Aristocrats',
            'S&P Composite 1500 Current & Past', 'S&P Composite 1500',
            'S&P MidCap 400 Current & Past', 'S&P MidCap 400',
            'S&P SmallCap 600 Current & Past', 'S&P SmallCap 600']

print(df["BMRN"].loc['2020-12-03':])
changeList=df[df["BMRN"] != 0]["BMRN"]
print(changeList,pd.to_datetime(str(changeList.index.values[0])).strftime("%m/%d/%Y"),changeList.values)

Directory Structure at end of date/time path e.g. 20210326_1300
*.csv                    <- time (x) vs. symbol (y) incidence matrices from Norgate for each index
misc/companyInfo.csv     <- companyInfo file from Norgate (two columns: symbol, companyName) used to make Don files
misc/latestSymbols.csv   <- symbol (x) vs. index (y) incidence matrix (single file)
Don/*.csv                <- ticker (x) vs. time (y) incidence matrices in Don Format for each index
changelist/*.csv         <- Date, AddSymbol, DeleteSymbol tables for each index
"""
import glob
import os

import numpy as np
import pandas as pd

# https://pypi.org/project/norgatedata/
# The import is optional so that the post-processing routines (which only read
# the CSV files) can be used on machines where Norgate is not installed.
try:
    import norgatedata as ngd
except ImportError:
    ngd = None


def flatten(l):
    """Return a single list holding the items of every sub-list of `l`, in order."""
    return [item for sublist in l for item in sublist]


def _require_norgate():
    """Raise a clear ImportError when a Norgate routine is used without norgatedata."""
    if ngd is None:
        raise ImportError(
            "norgatedata is not installed (pip install norgatedata); "
            "it is required by the Norgate export routines"
        )


#===========================================================================================================
def getIndexConstituent(ticker, indexname, priceadjust, padding_setting, timeseriesformat):
    """Return the index-membership time series of `ticker` for `indexname`.

    The price time series of `ticker` is fetched first and then handed to
    ngd.index_constituent_timeseries, which adds an "Index Constituent" column.
    That column is renamed to `ticker` and returned on its own.
    """
    _require_norgate()
    pricedata_df = ngd.price_timeseries(
        ticker,
        stock_price_adjustment_setting=priceadjust,
        padding_setting=padding_setting,
        timeseriesformat=timeseriesformat,
    )
    # and now make the call to index_constituent_timeseries to add the "Index Constituent" column
    pricedata_df2 = ngd.index_constituent_timeseries(
        ticker,
        indexname,
        padding_setting=padding_setting,
        limit=-1,
        pandas_dataframe=pricedata_df,
        timeseriesformat=timeseriesformat,
    )
    pricedata_df2 = pricedata_df2.rename(columns={"Index Constituent": ticker})
    return pricedata_df2[ticker]


def genNorgateMatrix(path):
    """Write one time-vs-symbol incidence matrix CSV per " Current & Past" watchlist.

    Output goes to `path`/<index name>.csv; a log of the watchlists and their
    symbols is written to `path`/log.txt.  An index that fails is reported (on
    stdout and in the log) and skipped so that the remaining ones still run.
    """
    print("\nRunning genNorgateMatrix")
    _require_norgate()
    os.makedirs(path, exist_ok=True)
    with open(path + '/log.txt', 'w') as logF:
        # https://pypi.org/project/norgatedata/
        priceadjust = ngd.StockPriceAdjustmentType.TOTALRETURN
        padding_setting = ngd.PaddingType.NONE
        timeseriesformat = 'pandas-dataframe'

        # index_constituent_timeseries takes an index, not a watchlist, so the
        # index name is derived by stripping the " Current & Past" suffix.
        currentAndPast = " Current & Past"
        allwatchlistnames = ngd.watchlists()
        logF.write("allwatchlistnames:" + str(allwatchlistnames) + "\n\n")
        # see Illustration 1 here: https://norgatedata.com/ndu-watchlist-library.php
        currentAndPastWatchlistNames = [w for w in allwatchlistnames if currentAndPast in w]
        logF.write(str(currentAndPastWatchlistNames))

        # First pass: log every watchlist; remember the symbols so that Norgate
        # is only asked once per watchlist.
        symbols_by_index = {}
        for watchlistname in currentAndPastWatchlistNames:
            indexname = watchlistname[:watchlistname.index(currentAndPast)]
            symbols = ngd.watchlist_symbols(watchlistname)
            symbols_by_index[indexname] = symbols
            logF.write("\n\n%s %d >%s<\n%s\n" % (watchlistname, len(symbols), indexname, str(symbols)))

        # Second pass: build and save the incidence matrix of each index.
        for indexname, symbols in symbols_by_index.items():
            logF.write("\n\n>" + indexname + "<\n")
            try:
                dfs = [getIndexConstituent(symbol,
                                           indexname.replace("Nasdaq", "NASDAQ"),  # 20210324 Workaround Norgate Bug#1
                                           priceadjust, padding_setting, timeseriesformat)
                       for symbol in symbols]
                MatrixDf = pd.concat(dfs, axis=1)
                MatrixDf.to_csv(os.path.join(path, indexname + ".csv"))
            except Exception as exc:
                # Best effort: keep going with the other indexes, but do not hide why this one failed.
                logF.write("FAILED: %r\n" % (exc,))
                print("oops: Failed with: ", indexname, repr(exc))


#===========================================================================================================
def genCorporateInfo(path):
    """Write `path`/misc/companyInfo.csv (symbol, name) for all " Current & Past" watchlist symbols."""
    print("\nRunning genCorporateInfo")
    _require_norgate()
    currentAndPast = " Current & Past"
    allwatchlistnames = ngd.watchlists()
    currentAndPastWatchlistNames = [w for w in allwatchlistnames if currentAndPast in w]
    symbols = set()
    for watchlistname in currentAndPastWatchlistNames:
        symbols.update(ngd.watchlist_symbols(watchlistname))
    symbols = sorted(symbols)
    names = [ngd.security_name(symbol) for symbol in symbols]
    path = os.path.join(path, "misc")
    os.makedirs(path, exist_ok=True)
    pd.DataFrame(data={"symbol": symbols, "name": names}).to_csv(os.path.join(path, "companyInfo.csv"))


#===========================================================================================================
# Routines above this point use Norgate to create intermediate files required for the routines below.
# No reliance on NorgateData below this point
#===========================================================================================================
def incidenceMatrixToChangeList(inputpath, printNoChange=True, filenames=None):
    """Convert incidence matrices to change lists (Date, AddedTicker, DeletedTicker).

    inputpath     -- directory holding the incidence matrix CSVs; the change
                     lists are written to `inputpath`/changelist/<same file name>.
    printNoChange -- when true, print a "ticker - -" line for tickers that never change.
    filenames     -- explicit list of incidence matrix CSVs; defaults to
                     `inputpath`/*.csv.  The files may live in any directory.
    """
    print("\nRunning incidenceMatrixToChangeList")
    if filenames is None:
        filenames = glob.glob(os.path.join(inputpath, "*.csv"))
    outputpath = os.path.join(inputpath, "changelist")
    os.makedirs(outputpath, exist_ok=True)

    for filename in filenames:
        print(filename)
        # fillna(0) (hopefully) fixes Norgate bug#2 by turning all NaNs to 0.0
        df0 = pd.read_csv(filename, header=0, index_col=0, parse_dates=True).fillna(0)
        df = df0 - df0.shift(1)  # Now, 1.0 => add; -1.0 => delete; 0.0 => No change
        df = df[1:]              # Remove first row (it has no predecessor to diff against)
        lines = []
        for ticker in df.columns:
            # Strip out all "no change" rows and leave the add/deletes behind
            changeList = df.loc[df[ticker] != 0, ticker]
            if changeList.empty:
                if printNoChange:
                    print("%s\t-\t-" % ticker)
                continue
            for date, value in zip(changeList.index.values, changeList.values):
                date = pd.Timestamp(date).strftime("%Y%m%d")  # Convert to YYYYMMDD format
                # E.g. CRM appeared on 8/31/2020; XOM disappeared on 8/31/2020
                # Date      Add  Delete
                # 20200831  CRM  -
                # 20200831  -    XOM
                if value == 1.0:
                    line = (date, ticker, "")
                elif value == -1.0:
                    line = (date, "", ticker)
                else:
                    line = (date, ticker, "ERROR")
                lines.append(line)
        CLdf = pd.DataFrame(data=lines, columns=["Date", "Add", "Delete"]).sort_values(by=['Date', 'Add', 'Delete'])
        # Build the output name from the base name: substituting inputpath inside
        # `filename` would overwrite the input whenever it lives elsewhere.
        CLdf.to_csv(os.path.join(outputpath, os.path.basename(filename)))


#===========================================================================================================
def xlat(dictionary, key):
    """Return dictionary[key]; on a missing key print an error and return "***ERROR***"."""
    if key in dictionary:
        return dictionary[key]
    print("error: ", key)
    return "***ERROR***"


def incidenceMatrixToDonFormat(inputpath):
    """Convert each `inputpath`/*.csv incidence matrix to "Don format".

    Don format is ticker (rows) vs. business-month-end date (columns, mm/dd/YYYY)
    with 'X' marking membership, a blank marking non-membership and the company
    name (looked up in `inputpath`/misc/companyInfo.csv) in the first column.
    Results are written to `inputpath`/Don/<same file name>.
    """
    print("\nRunning incidenceMatrixToDonFormat")
    filenames = glob.glob(os.path.join(inputpath, "*.csv"))
    outputpath = os.path.join(inputpath, "Don")
    os.makedirs(outputpath, exist_ok=True)

    # keep_default_na=False: a ticker such as "NA" must stay a string, not become NaN.
    CIdf = pd.read_csv(os.path.join(inputpath, "misc", "companyInfo.csv"), keep_default_na=False)
    names_by_symbol = dict(zip(CIdf["symbol"].values, CIdf["name"].values))

    for filename in filenames:
        print(filename)
        # fillna(0) (hopefully) fixes Norgate bug#2 by turning all NaNs to 0.0
        matrix = pd.read_csv(filename, header=0, index_col=0, parse_dates=True).fillna(0)
        # Offset object instead of the 'BM' string alias, which newer pandas deprecates.
        month_end = matrix.resample(pd.offsets.BMonthEnd()).last()
        # Both the float and the int spelling are replaced, as columns may be read as either dtype.
        df = (month_end
              .replace(0.0, " ").replace(0, " ")
              .replace(1.0, 'X').replace(1, 'X')
              .transpose())
        df.columns = [column.strftime("%m/%d/%Y") for column in df.columns]  # Put in time format that Don likes.
        # insert company names associated with ticker symbols in first column
        df.insert(0, 'Name', [xlat(names_by_symbol, symbol) for symbol in df.index.values])
        df.to_csv(os.path.join(outputpath, os.path.basename(filename)))


#===========================================================================================================
def makeLatestSymbolsMatrix(path, filenames=None):
    """Build the ticker (rows) vs. index name (columns) matrix of current members.

    The last row of every incidence matrix gives the current constituents of
    that index.  The 0/1 matrix is written to `path`/misc/latestSymbols.csv and
    returned.  `filenames` defaults to `path`/*.csv; the column name of each
    index is the file's base name without its extension.
    """
    print("\nRunning makeLatestSymbolsMatrix")
    if filenames is None:
        filenames = glob.glob(os.path.join(path, "*.csv"))
    members = {}  # Dictionary of symbol lists indexed by stock index name
    for filename in filenames:
        # fillna(0) (hopefully) fixes Norgate bug#2 by turning all NaNs to 0.0
        last_row = pd.read_csv(filename, header=0, index_col=0, parse_dates=True).fillna(0).iloc[-1]
        # basename/splitext works for both \ and / separators and for files outside `path`
        index_name = os.path.splitext(os.path.basename(filename))[0]
        is_member = np.asarray(last_row, dtype=bool)
        members[index_name] = list(last_row.index[is_member])  # Convert incidence row to list of ticker names

    tickers = sorted(set(flatten(members.values())))
    # Incidence matrix of tickers(rows) vs index name(columns)
    columns = {}
    for index_name, symbols in members.items():
        held = set(symbols)
        columns[index_name] = [int(ticker in held) for ticker in tickers]
    df = pd.DataFrame(columns, index=tickers)

    path = os.path.join(path, "misc")
    os.makedirs(path, exist_ok=True)
    df.to_csv(os.path.join(path, "latestSymbols.csv"))
    return df


def getLatestYahooSymbols(path, idx):
    """Return the current members of index `idx` from `path`/misc/latestSymbols.csv.

    Ticker names are converted from Norgate format (BRK.B) to Yahoo format (BRK-B).
    """
    # keep_default_na=False: a ticker such as "NA" must stay a string, not become NaN.
    df = pd.read_csv(os.path.join(path, "misc", "latestSymbols.csv"),
                     header=0, index_col=0, keep_default_na=False)
    is_member = np.asarray(df[idx], dtype=bool)
    return [str(s).replace('.', '-') for s in df.index[is_member]]


#===========================================================================================================
if __name__ == '__main__':
    path = "20210327_1500"
    genNorgateMatrix(path)
    genCorporateInfo(path)
    incidenceMatrixToDonFormat(path)
    incidenceMatrixToChangeList(path, printNoChange=False)  # ,filenames=["old/20210323_1230/NASDAQ 100.csv"])
    df0 = makeLatestSymbolsMatrix(path)  # ,filenames=["old/20210323_1230/NASDAQ 100.csv"])
    # Example: tickers = getLatestYahooSymbols("20210325_1900", "NASDAQ 100")