1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #ifndef _AO_H 28 #define _AO_H 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 #include <sys/types.h> 33 #include <sys/mc.h> 34 #include <sys/mca_amd.h> 35 #include <sys/mc_amd.h> 36 #include <sys/cpu_module_impl.h> 37 #include <sys/nvpair.h> 38 #include <sys/cyclic.h> 39 #include <sys/errorq.h> 40 #include <sys/kobj.h> 41 #include <sys/fm/util.h> 42 43 #ifdef __cplusplus 44 extern "C" { 45 #endif 46 47 #define AO_MCA_MAX_ERRORS 10 48 49 typedef struct ao_data ao_data_t; 50 51 typedef struct ao_bank_regs { 52 uint32_t abr_status; 53 uint32_t abr_addr; 54 uint32_t abr_misc; 55 } ao_bank_regs_t; 56 57 extern ao_bank_regs_t ao_bank_regs[AMD_MCA_BANK_COUNT]; 58 59 /* 60 * Rather than using torturous conditionals, we match errors using a table of 61 * ao_error_disp_t's. The members in the ao_error_disp_t are matched against 62 * the value of MCi_STATUS, with a successful match indicating that the given 63 * error occurred. 64 * 65 * While aed_stat_code will match most of the status code bits, a few of the 66 * status code fields are either/or, and are treated separately so as to 67 * minimize the number of ao_error_disp_t structures that must be created. 68 * For example, the dc.tag_par error can have r4 values drd or dwr. Rather 69 * than creating two ao_error_disp_t's, we use the separate aed_stat_r4_bits 70 * field to indicate both AO_MCA_R4_BIT_DRD and AO_MCA_R4_BIT_DWD. As the 71 * matching r4 values are drawn from aed_stat_r4_bits, we don't use the r4 72 * bits in aed_stat_code for matching. Similar reasoning lies behind the 73 * creation of the pp and ii fields. 74 */ 75 #define AO_AED_PANIC_NEVER 0x00 76 #define AO_AED_PANIC_IFMCE 0x01 77 #define AO_AED_PANIC_ALWAYS 0x80 78 79 #define AO_AED_F_CORRECTABLE 0x01 80 #define AO_AED_F_LOFAULT_OK 0x02 81 #define AO_AED_F_LINEAR 0x04 /* MCi_ADDR is a linear address */ 82 #define AO_AED_F_PHYSICAL 0x08 /* MCi_ADDR is a physical address */ 83 #define AO_AED_F_PAGEALIGNED 0x10 /* MCi_ADDR aligns to page size */ 84 #define AO_AED_F_L2SETWAY 0x20 /* 3:0 = way, 15/14/13/12:6 = set */ 85 86 #define AO_AED_FLAGS_ADDRTYPE (AO_AED_F_LINEAR | AO_AED_F_PHYSICAL | \ 87 AO_AED_F_PAGEALIGNED | AO_AED_F_L2SETWAY) 88 89 typedef struct ao_error_disp { 90 const char *aed_class; /* ereport class for use if match */ 91 uint64_t aed_ereport_members; /* ereport contents flags if match */ 92 uint64_t aed_stat_mask; /* status msr bits for match */ 93 uint64_t aed_stat_mask_res; /* status mask result for match */ 94 uint16_t aed_stat_code; /* status code for match */ 95 uint8_t aed_stat_extcode; /* extended status code for match */ 96 uint8_t aed_stat_pp_bits:4; /* AO_MCA_PP_BIT_* for pp matching */ 97 uint8_t aed_stat_ii_bits:4; /* AO_MCA_II_BIT_* for ii matching */ 98 uint16_t aed_stat_r4_bits; /* AO_MCA_R4_BIT_* for r4 matching */ 99 uint8_t aed_addrvalid_hi; /* most significant valid addr bit */ 100 uint8_t aed_addrvalid_lo; /* least significant valid addr bit */ 101 uint8_t aed_panic_when; /* extra conditions for panic */ 102 uint8_t aed_flags; /* AO_AED_F_* */ 103 } ao_error_disp_t; 104 105 /* 106 * The poller has two parts. First is the omni cyclic, which runs on all 107 * CPUs, and which polls the error MSRs at some fixed (long) interval. This 108 * cyclic will run on all machines, all the time, and thus must have minimal 109 * runtime impact. The second portion of the poller is manually-initiated, and 110 * is used by the error injector/synthesizer to request an immediate poll of the 111 * error state registers. 112 * 113 * With this number of moving parts, it is essential that we have some sort of 114 * audit log for post-mortem analysis. A circular array of trace buffers 115 * (ao_mca_poll_trace_t structures) is kept to record this activity. Whenever 116 * an event occurs that is of interest to the poller, an entry is made in 117 * the trace array describing that event. 118 */ 119 #define AO_MPT_WHAT_CYC_ERR 0 /* cyclic-induced poll */ 120 #define AO_MPT_WHAT_POKE_ERR 1 /* manually-induced poll */ 121 #define AO_MPT_WHAT_UNFAULTING 2 /* discarded error state */ 122 123 typedef struct ao_mca_poll_trace { 124 hrtime_t mpt_when; /* timestamp of event */ 125 uint8_t mpt_what; /* AO_MPT_WHAT_* (which event?) */ 126 uint8_t mpt_nerr; /* number of errors discovered */ 127 uint16_t mpt_pad1; 128 uint32_t mpt_pad2; 129 } ao_mca_poll_trace_t; 130 131 /* 132 * Processor error state is saved in logout areas. There are three separate 133 * logout areas, each used for a different purpose. The logout areas are stored 134 * in an array (ao_mca_logout), indexed by the AO_MCA_LOGOUT_* macros. 135 * 136 * The save areas are: 137 * 138 * 1. Exception handler MSR save - Written to by the initial portion of the #mc 139 * handler. Read from by the main body of the exception handler. 140 * 141 * 3. Poller MSR save - Used by the poller to store error state MSR values. 142 * While this logout area doesn't necessarily have to live in the ao_mca_t, 143 * it does so to enhance observability. 144 * 145 * The logout areas contain both global error state (acl_ip, acl_timestamp, 146 * etc.), as well as a bank array. The bank array contains one ao_bank_logout_t 147 * per error reporting bank. 148 */ 149 150 typedef struct ao_bank_logout { 151 uint64_t abl_status; /* Saved MCi_STATUS register */ 152 uint64_t abl_addr; /* Saved MCi_ADDR register */ 153 uint64_t abl_misc; /* Saved MCi_MISC register */ 154 uint8_t abl_addr_type; /* flags & AO_AED_FLAGS_ADDRTYPE */ 155 uint8_t abl_addr_valid_hi; /* most significant valid addr bit */ 156 uint8_t abl_addr_valid_lo; /* least significant valid addr bit */ 157 } ao_bank_logout_t; 158 159 #define AO_ACL_F_PRIV 0x1 /* #mc in kernel mode (else user) */ 160 #define AO_ACL_F_FATAL 0x2 /* logout detected fatal error(s) */ 161 162 typedef struct ao_cpu_logout { 163 ao_data_t *acl_ao; /* pointer to per-cpu ao_data_t */ 164 uintptr_t acl_ip; /* instruction pointer if #mc trap */ 165 uint64_t acl_timestamp; /* gethrtime() at time of logout */ 166 uint64_t acl_mcg_status; /* MCG_STATUS register value */ 167 ao_bank_logout_t acl_banks[AMD_MCA_BANK_COUNT]; /* bank state saves */ 168 pc_t acl_stack[FM_STK_DEPTH]; /* saved stack trace (if any) */ 169 int acl_stackdepth; /* saved stack trace depth */ 170 uint_t acl_flags; /* flags (see AO_ACL_F_* above) */ 171 } ao_cpu_logout_t; 172 173 /* Index for ao_mca_logout, below */ 174 #define AO_MCA_LOGOUT_EXCEPTION 0 175 #define AO_MCA_LOGOUT_POLLER 1 176 #define AO_MCA_LOGOUT_NUM 2 177 178 #define AO_MCA_F_UNFAULTING 0x1 /* CPU exiting faulted state */ 179 180 /* 181 * We store config as inherited from the BIOS to assist in troubleshooting. 182 * The NorthBridge config is stored in the chipshared structure below. 183 */ 184 typedef struct ao_bios_cfg { 185 uint64_t bcfg_bank_ctl[AMD_MCA_BANK_COUNT]; 186 uint64_t bcfg_bank_mask[AMD_MCA_BANK_COUNT]; 187 uint64_t bcfg_bank_misc[AMD_MCA_BANK_COUNT]; 188 } ao_bios_cfg_t; 189 190 /* 191 * The master data structure used to hold MCA-related state. 192 */ 193 typedef struct ao_mca { 194 ao_bios_cfg_t ao_mca_bios_cfg; /* Bank and NB config before our init */ 195 ao_cpu_logout_t ao_mca_logout[AO_MCA_LOGOUT_NUM]; /* save areas */ 196 kmutex_t ao_mca_poll_lock; /* keep pollers from colliding */ 197 ao_mca_poll_trace_t *ao_mca_poll_trace; /* trace buffers for this cpu */ 198 uint_t ao_mca_poll_curtrace; /* most recently-filled trace buffer */ 199 uint_t ao_mca_flags; /* AO_MCA_F_* */ 200 } ao_mca_t; 201 202 /* 203 * Per-chip state 204 */ 205 struct ao_chipshared { 206 uint32_t aos_chiprev; /* Chip revision */ 207 volatile ulong_t aos_cfgonce; /* Config performed once per chip */ 208 kmutex_t aos_nb_poll_lock; /* Keep NB pollers from colliding */ 209 uint64_t aos_nb_poll_timestamp; /* Timestamp of last NB poll */ 210 int aos_nb_poll_owner; /* The cpuid of current NB poller */ 211 uint64_t aos_bcfg_nb_ctl; /* BIOS value of MC4_CTL */ 212 uint64_t aos_bcfg_nb_mask; /* BIOS value of MC4_MASK */ 213 uint64_t aos_bcfg_nb_misc; /* BIOS value of MC4_MISC */ 214 uint32_t aos_bcfg_nb_cfg; /* BIOS value of NB MCA Config */ 215 uint32_t aos_bcfg_nb_sparectl; /* BIOS value of Online Spare Control */ 216 uint32_t aos_bcfg_dcfg_lo; /* BIOS value of DRAM Config Low */ 217 uint32_t aos_bcfg_dcfg_hi; /* BIOS value of DRAM Config High */ 218 }; 219 220 /* Bit numbers for aos_cfgonce */ 221 enum ao_cfgonce_bitnum { 222 AO_CFGONCE_NBMCA, 223 AO_CFGONCE_DRAMCFG 224 }; 225 226 /* 227 * Per-CPU state 228 */ 229 struct ao_data { 230 ao_mca_t ao_mca; /* MCA state for this CPU */ 231 cpu_t *ao_cpu; /* link to CPU's cpu_t */ 232 const cmi_mc_ops_t *ao_mc_ops; /* memory controller ops */ 233 void *ao_mc_data; /* argument for MC ops */ 234 struct ao_chipshared *ao_shared; /* Shared state for the chip */ 235 }; 236 237 #ifdef _KERNEL 238 239 struct regs; 240 241 extern errorq_t *ao_mca_queue; 242 extern const cmi_ops_t _cmi_ops; 243 244 extern void ao_faulted_enter(void *); 245 extern void ao_faulted_exit(void *); 246 extern int ao_scrubber_enable(void *, uint64_t, uint64_t, int); 247 248 extern void ao_mca_post_init(void *); 249 extern void ao_mca_init(void *); 250 extern int ao_mca_trap(void *, struct regs *); 251 extern int ao_mca_inject(void *, cmi_mca_regs_t *, uint_t); 252 extern void ao_mca_poke(void *); 253 extern void ao_mca_poll_init(ao_data_t *, int); 254 extern void ao_mca_poll_start(void); 255 256 extern int ao_mca_logout(ao_cpu_logout_t *, struct regs *, int *, int, 257 uint32_t); 258 extern void ao_mca_drain(void *, const void *, const errorq_elem_t *); 259 extern nvlist_t *ao_fmri_create(ao_data_t *, nv_alloc_t *); 260 261 extern void ao_mc_register(void *, const cmi_mc_ops_t *, void *); 262 extern const struct cmi_mc_ops *ao_mc_getops(void *); 263 extern int ao_mc_patounum(ao_data_t *, uint64_t, uint8_t, uint8_t, uint32_t, 264 int, mc_unum_t *); 265 extern int ao_mc_unumtopa(ao_data_t *, mc_unum_t *, nvlist_t *, uint64_t *); 266 267 extern void ao_pcicfg_write(uint_t, uint_t, uint_t, uint32_t); 268 extern uint32_t ao_pcicfg_read(uint_t, uint_t, uint_t); 269 270 extern int ao_chip_once(ao_data_t *, enum ao_cfgonce_bitnum); 271 272 #endif /* _KERNEL */ 273 274 #ifdef __cplusplus 275 } 276 #endif 277 278 #endif /* _AO_H */ 279