/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

#define MAX(x, y) ((x) >= (y) ? (x) : (y))
#define MIN(x, y) ((x) <= (y) ? (x) : (y))

/*
 * SET_ADDR and SET_OFFSET set payload values in the fault that we diagnose
 * for page faults, recording the physical address of the faulting page
 * (taken from IA32_MCi_ADDR) and its offset (taken from the first member
 * of the ereport "resource" array).
 */
#define SET_ADDR \
    (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

#define SET_OFFSET \
    (setpayloadprop("asru-offset", \
    payloadprop("resource[0].hc-specific.offset")))

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define RESOURCE_EXISTS (payloadprop_defined("resource"))

/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node, or for the dimm node that contains it.  Our memory
 * propagations are of the form
 *
 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/core/strand"
 *
 * since cpus detect memory errors; in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides).  We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 */
#define CONTAINS_RANK \
    (payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm/rank)) || \
    payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm)))

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

/* Lowest nibble of the syndrome. */
#define CBITMASK(synd) ((synd) & 0xf)

/*
 * True for a zero syndrome, or when exactly one bit is set in the lowest
 * nibble of the syndrome.
 */
#define CKSINGLE(synd) \
    ((synd) == 0 || \
    CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 || \
    CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8)

/*
 * Syndrome type "E" always counts as single-bit; syndrome type "C4"
 * counts as single-bit only when CKSINGLE holds for the syndrome.
 */
#define SINGLE_BIT_CE \
    (payloadprop("syndrome-type") == "E" || \
    (payloadprop("syndrome-type") == "C4" && \
    CKSINGLE(payloadprop("syndrome"))))

/* Syndrome type "C4" for which CKSINGLE does not hold. */
#define MULTI_BIT_CE \
    (payloadprop("syndrome-type") == "C4" && \
    !CKSINGLE(payloadprop("syndrome")))

/*								#PAGE#
 *								#DIMM_SCU#
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a page fault so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing dimm.
 *
 * Multibit ChipKill-correctable errors are treated identically to
 * single-bit errors, but via separate serd engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/core/strand{within(5s)};

/*
 * Single-bit correctable errors feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page_sb and fault.memory.page_ck diagnosed is
 * counted in stat engines for each type.  These are used in deciding
 * whether to declare a dimm faulty after repeated page faults.
138 */ 139 140#define PAGE_SB_COUNT 2 141#define PAGE_SB_TIME 72h 142#define PAGE_CK_COUNT 2 143#define PAGE_CK_TIME 72h 144 145engine stat.sbpgflt@chip/memory-controller/dimm/rank; 146engine stat.ckpgflt@chip/memory-controller/dimm/rank; 147engine serd.memory.page_sb@chip/memory-controller/dimm/rank, 148 N=PAGE_SB_COUNT, T=PAGE_SB_TIME; 149engine serd.memory.page_ck@chip/memory-controller/dimm/rank, 150 N=PAGE_CK_COUNT, T=PAGE_CK_TIME; 151engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank, 152 N=PAGE_SB_COUNT, T=PAGE_SB_TIME; 153engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank, 154 N=PAGE_CK_COUNT, T=PAGE_CK_TIME; 155event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0, 156 count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0, 157 engine=serd.memory.page_sb@chip/memory-controller/dimm/rank; 158event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0, 159 count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0, 160 engine=serd.memory.page_ck@chip/memory-controller/dimm/rank; 161event fault.memory.dimm_sb@chip/memory-controller/dimm/rank, 162 engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank; 163event fault.memory.dimm_ck@chip/memory-controller/dimm/rank, 164 engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank; 165 166/* 167 * The fraction of pages on a single rank that must be diagnosed as faulty 168 * with single correctable unit faults before we will fault the rank. 169 * Once we have faulted the rank we will continue to diagnose any further page 170 * faults on the rank up to some maximum multiple of the threshold at which 171 * we faulted the dimm. 
This allows us to potentially contain some fairly 172 * far-reaching but still limited-extent fault (such as a partial column 173 * failure) without getting carried away and allowing a single faulty rank to 174 * use up the entire system-imposed page retirenment limit (which, once 175 * reached, causes retirement request to have no effect other than to fill 176 * the fault manager cache and logs). 177 * 178 * This fraction is specified in basis points, where 100 basis points are 179 * equivalent to 1 percent. It is applied on a per-rank basis. 180 * 181 * The system imposes an absolute maximum on the number of pages it will 182 * retire; the current value is 10 basis points, or 0.1% of 'physmem'. Note 183 * that 'physmem' is reduced from installed memory pages by an amount 184 * reflecting permanent kernel memory allocations. This system page retire 185 * limit bounds the maximum real response to page faults across all ranks 186 * that fault manager response agents can effect, but it should not be confused 187 * with any diagnosis threshold (i.e., the number of faulty pages we are 188 * prepared to tolerate from a single rank before faulting the rank is 189 * distinct from the total number of pages we are prepared to retire from use 190 * in response to that and other faults). It is, however, desirable to 191 * arrange that the maximum number of pages we are prepared to fault from 192 * any one rank is less than the system-wide quota. 193 */ 194#define PAGE_RETIRE_LIMIT_BPS 5 /* or 0.05%; ~ 131 pages/GB %/ 195 196/* 197 * A macro to manipulate the above fraction. Given a size in bytes convert 198 * this to pages (4K pagesize) and calculate the number of those pages 199 * indicated by PAGE_RETIRE_LIMIT_BPS basis points. 200 */ 201#define _BPS_PGCNT(totalbytes) \ 202 ((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000) 203 204/* 205 * The single-correctable-unit threshold at which number of faulted pages 206 * on a rank we we fault the rank. 
We insist that this be at least 128 and 207 * never more than 512. 208 */ 209#define RANK_THRESH MIN(512, MAX(128, \ 210 _BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size")))) 211 212/* 213 * The maximum number of single-correctable-unit page faults we will diagnose 214 * on a single rank (must be greater than RANK_THRESH). We set 215 * this at twice the rank fault threshold. 216 */ 217#define RANK_PGFLT_MAX (2 * RANK_THRESH) 218 219#define SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank)) 220#define CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank)) 221 222/* 223 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of 224 * page faults (diagnosed from repeated single-bit or multibit-chipkills) 225 * from any one rank on that DIMM reaches a threshold. A "correctable unit" 226 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill 227 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron). 228 * 229 * We do not stop diagnosing further single-bit page faults once we have 230 * declared a single-bit DIMM fault - we continue diagnosing them and 231 * response agents can continue to retire those pages up to the system-imposed 232 * retirement limit. 233 * 234 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and 235 * fault.memory.dimm_ck. Which one is diagnosed depends on whether we 236 * have reached the threshold for a majority of single-bit page faults or 237 * multibit page faults. 238 * 239 * Implementation: we maintain parallel SERD engines to the page_sb and 240 * page_ck engines, which trip in unison. On trip it generates a distinct 241 * ereport which we diagnose to a fault if the threshold has been reached. 
242 */ 243prop fault.memory.page_sb@chip/memory-controller/dimm/rank 244 { CONTAINS_RANK && SINGLE_BIT_CE && 245 SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)-> 246 ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>, 247 ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>, 248 ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>; 249 250prop fault.memory.page_ck@chip/memory-controller/dimm/rank 251 { CONTAINS_RANK && !SINGLE_BIT_CE && 252 SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)-> 253 ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>, 254 ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>, 255 ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>; 256 257prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank 258 { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH && 259 SB_PGFLTS > RANK_THRESH / 2 } (1)-> 260 ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>, 261 ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>, 262 ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>; 263 264prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank 265 { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH && 266 CK_PGFLTS > RANK_THRESH / 2 } (1)-> 267 ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>; 268 269/* 270 * If the address is not valid then no resource member will be included 271 * in a nb.mem_ce or nb.mem_ue ereport. These cases should be rare. 272 * We will also discard all inf_sys_ecc1 events detected at the ic since they 273 * have no syndrome and therefore no resource information. 274 * We will discard such ereports. An alternative may be to SERD them 275 * on a per MC basis and trip if we see too many such events. 
276 */ 277event upset.memory.discard1@chip/core/strand; 278prop upset.memory.discard1@chip/core/strand 279 { !RESOURCE_EXISTS } (1)-> 280 ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand, 281 ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand, 282 ereport.cpu.amd.bu.s_ecc1@chip/core/strand, 283 ereport.cpu.amd.nb.mem_ce@chip/core/strand; 284 285/* #DIMM_UE# 286 * #PAGE_UE# 287 * An uncorrectable multi-bit fault in a memory dimm can cause: 288 * 289 * - mem_ue : reported by nb for an access from a remote cpu 290 * - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome 291 * - s_eccm : reported by bu 292 * 293 * Since on production systems we force HT Sync Flood on uncorrectable 294 * memory errors (if not already set as such by the BIOS, as it should be) 295 * we won't actually receive these ereports since the system will be reset. 296 */ 297 298event ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand{within(5s)}; 299event ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand{within(5s)}; 300event ereport.cpu.amd.bu.s_eccm@chip/core/strand{within(5s)}; 301event ereport.cpu.amd.nb.mem_ue@chip/core/strand{within(5s)}; 302 303event fault.memory.dimm_ue@chip/memory-controller/dimm/rank; 304event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0, 305 response=0; 306 307prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank 308 { CONTAINS_RANK } (1)-> 309 ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>, 310 ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>, 311 ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>, 312 ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>; 313 314prop fault.memory.page_ue@chip/memory-controller/dimm/rank 315 { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)-> 316 ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>, 317 ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>, 318 ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>, 319 ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>; 320 321event 
upset.memory.discard3@chip/core/strand; 322prop upset.memory.discard3@chip/core/strand 323 { !RESOURCE_EXISTS } (1)-> 324 ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand, 325 ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand, 326 ereport.cpu.amd.bu.s_eccm@chip/core/strand, 327 ereport.cpu.amd.nb.mem_ue@chip/core/strand; 328 329/* #CSTESTFAIL# 330 * If the BIOS fails a chip-select during POST, or perhaps after a 331 * sync flood from an uncorrectable error, then on revision F and G it 332 * should mark that chip-select as TestFail in the CS Base register. 333 * When the memory-controller driver discovers all the MC configuration 334 * it notes such failed chip-selects and creates topology nodes for the 335 * chip-select and associated dimms and ranks, and produces an ereport for each 336 * failed chip-select with detector set to the memory-controller node 337 * and resource indicating the failed chip-select. 338 */ 339 340event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)}; 341event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank; 342event error.memory.cs_testfail@chip/memory-controller/chip-select; 343 344#define CONTAINS_CS (payloadprop_contains("resource", \ 345 asru(chip/memory-controller/chip-select))) 346 347prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)-> 348 ereport.cpu.amd.mc.cs_testfail@chip/memory-controller 349 { CONTAINS_CS }; 350 351#define CSMATCH(s) \ 352 (confprop_defined(chip/memory-controller/chip-select, s) && \ 353 confprop(chip/memory-controller/chip-select, s) == \ 354 confprop(chip/memory-controller/dimm/rank, "csname")) 355 356prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (0)-> 357 error.memory.cs_testfail@chip/memory-controller/chip-select 358 { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname")}; 359 360/* #ADDRPAR# 361 * DRAM Command/Address Parity Errors. 
362 * 363 * - dramaddr_par : reported by the nb; the NB status register includes 364 * a bit indicating which dram controller channel (A or B) experienced 365 * the error. 366 */ 367 368event ereport.cpu.amd.nb.dramaddr_par@chip/core/strand{within(5s)}; 369event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel, response=0; 370 371prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)-> 372 ereport.cpu.amd.nb.dramaddr_par@chip/core/strand { 373 ((payloadprop("IA32_MCi_STATUS") >> 32 & 0x200) ? 1 : 0) == y }; 374 375/* #L2D_SINGLE# 376 * A single bit data array fault in an l2 cache can cause: 377 * 378 * - inf_l2_ecc1 : reported by ic on this cpu 379 * - inf_l2_ecc1 : reported by dc on this cpu 380 * - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu 381 */ 382 383#define L2CACHEDATA_SB_COUNT 3 384#define L2CACHEDATA_SB_TIME 12h 385 386event ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand{within(5s)}; 387event ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand{within(5s)}; 388event ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand{within(5s)}; 389engine serd.cpu.amd.l2d_sb@chip/core/strand, 390 N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME; 391event fault.cpu.amd.l2cachedata@chip/core/strand, engine=serd.cpu.amd.l2d_sb@chip/core/strand; 392 393prop fault.cpu.amd.l2cachedata@chip/core/strand (0)-> 394 ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand, 395 ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand, 396 ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand; 397 398/* #L2D_MULTI# 399 * A multi-bit data array fault in an l2 cache can cause: 400 * 401 * - inf_l2_eccm : reported by ic on this cpu 402 * - inf_l2_eccm : reported by dc on this cpu 403 * - l2d_eccm : reported by bu on copyback or on snoop from another cpu 404 */ 405 406event ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand{within(5s)}; 407event ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand{within(5s)}; 408event ereport.cpu.amd.bu.l2d_eccm@chip/core/strand{within(5s)}; 409 
/*
 * Multi-bit l2 data errors use a SERD increment one greater than the
 * engine's N.
 */
prop fault.cpu.amd.l2cachedata@chip/core/strand
    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2d_eccm@chip/core/strand;

/*								#L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

#define L2CACHETAG_SB_COUNT 3
#define L2CACHETAG_SB_TIME 12h

event ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2t_sb@chip/core/strand,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
event fault.cpu.amd.l2cachetag@chip/core/strand,
    engine=serd.cpu.amd.l2t_sb@chip/core/strand;

prop fault.cpu.amd.l2cachetag@chip/core/strand (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand;

/*								#L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/core/strand{within(5s)};

/* l2t_par is propagated here, with the increment of N + 1, not above. */
prop fault.cpu.amd.l2cachetag@chip/core/strand
    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2t_par@chip/core/strand;

/*								#ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define ICACHEDATA_SB_COUNT 2
#define ICACHEDATA_SB_TIME 168h

event ereport.cpu.amd.ic.data_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.icachedata@chip/core/strand,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
event fault.cpu.amd.icachedata@chip/core/strand,
    engine=serd.cpu.amd.icachedata@chip/core/strand;

prop fault.cpu.amd.icachedata@chip/core/strand (0)->
    ereport.cpu.amd.ic.data_par@chip/core/strand;

/*								#ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define ICACHETAG_SB_COUNT 2
#define ICACHETAG_SB_TIME 168h

event ereport.cpu.amd.ic.tag_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.icachetag@chip/core/strand,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
event fault.cpu.amd.icachetag@chip/core/strand,
    engine=serd.cpu.amd.icachetag@chip/core/strand;

prop fault.cpu.amd.icachetag@chip/core/strand (0)->
    ereport.cpu.amd.ic.tag_par@chip/core/strand;

/*								#ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

event ereport.cpu.amd.ic.stag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.icachestag@chip/core/strand;

prop fault.cpu.amd.icachestag@chip/core/strand (1)->
    ereport.cpu.amd.ic.stag_par@chip/core/strand;

/*								#ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define ICACHEL1TLB_SB_COUNT 2
#define ICACHEL1TLB_SB_TIME 168h

event ereport.cpu.amd.ic.l1tlb_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l1itlb@chip/core/strand,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
event fault.cpu.amd.l1itlb@chip/core/strand,
    engine=serd.cpu.amd.l1itlb@chip/core/strand;

prop fault.cpu.amd.l1itlb@chip/core/strand (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/core/strand;

/*								#ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define ICACHEL2TLB_SB_COUNT 2
#define ICACHEL2TLB_SB_TIME 168h

event ereport.cpu.amd.ic.l2tlb_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2itlb@chip/core/strand,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
event fault.cpu.amd.l2itlb@chip/core/strand,
    engine=serd.cpu.amd.l2itlb@chip/core/strand;

prop fault.cpu.amd.l2itlb@chip/core/strand (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/core/strand;

/*								#DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic, so
 * it is handled by the multi-bit case in the following section.
 */

#define DCACHEDATA_SB_COUNT 2
#define DCACHEDATA_SB_TIME 168h

event ereport.cpu.amd.dc.data_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand{within(5s)};
engine serd.cpu.amd.dc_sb@chip/core/strand,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
event fault.cpu.amd.dcachedata@chip/core/strand,
    engine=serd.cpu.amd.dc_sb@chip/core/strand;

prop fault.cpu.amd.dcachedata@chip/core/strand (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/core/strand;

/*								#DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/core/strand{within(5s)};

/*
 * The increment is derived from DCACHEDATA_SB_COUNT, the N of the
 * serd.cpu.amd.dc_sb engine attached to this fault, so that it stays
 * one greater than N if that count is ever retuned.
 */
prop fault.cpu.amd.dcachedata@chip/core/strand
    { setserdincrement(DCACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.dc.data_eccm@chip/core/strand,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand;

/*								#DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.tag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dcachetag@chip/core/strand;

prop fault.cpu.amd.dcachetag@chip/core/strand (1)->
    ereport.cpu.amd.dc.tag_par@chip/core/strand;

/*								#DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.stag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dcachestag@chip/core/strand;

prop fault.cpu.amd.dcachestag@chip/core/strand (1)->
    ereport.cpu.amd.dc.stag_par@chip/core/strand;

/*								#DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.l1tlb_par@chip/core/strand{within(5s)};
event fault.cpu.amd.l1dtlb@chip/core/strand;

prop fault.cpu.amd.l1dtlb@chip/core/strand (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/core/strand;

/*								#DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.l2tlb_par@chip/core/strand{within(5s)};
event fault.cpu.amd.l2dtlb@chip/core/strand;

prop fault.cpu.amd.l2dtlb@chip/core/strand (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/core/strand;

/*								#MISC#
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *	- the corresponding detector is not enabled, typically because
 *	  detection/handling of the event is taking place elsewhere
 *	  (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *	- the event is associated with a sync flood so even if the detector is
 *	  enabled we will never handle the event and generate an ereport *and*
 *	  even if the ereport did arrive we could perform no useful diagnosis
 *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
 *	  but we don't choose to discard that ereport here since we could have
 *	  made a useful diagnosis from it had it been delivered
 *	  (nb.ht_sync, nb.ht_crc)
 *	- events that will be accompanied by an immediate panic and
 *	  delivery of the ereport during subsequent reboot but from
 *	  which no useful diagnosis can be made. (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@strand{within(5s)};
event ereport.cpu.amd.nb.ta@strand{within(5s)};
event ereport.cpu.amd.ls.s_rde@strand{within(5s)};
event ereport.cpu.amd.ic.rdde@strand{within(5s)};
event ereport.cpu.amd.bu.s_rde@strand{within(5s)};
event ereport.cpu.amd.nb.gart_walk@strand{within(5s)};
event ereport.cpu.amd.nb.ht_sync@strand{within(5s)};
event ereport.cpu.amd.nb.ht_crc@strand{within(5s)};
event ereport.cpu.amd.nb.rmw@strand{within(5s)};
event ereport.cpu.amd.nb.wdog@strand{within(5s)};
event ereport.cpu.amd.unknown@strand{within(5s)};

event upset.null_diag@strand;

prop upset.null_diag@strand (1)->
    ereport.cpu.amd.nb.ma@strand,
    ereport.cpu.amd.nb.ta@strand,
    ereport.cpu.amd.ls.s_rde@strand,
    ereport.cpu.amd.ic.rdde@strand,
    ereport.cpu.amd.bu.s_rde@strand,
    ereport.cpu.amd.nb.gart_walk@strand,
    ereport.cpu.amd.nb.ht_sync@strand,
    ereport.cpu.amd.nb.ht_crc@strand,
    ereport.cpu.amd.nb.rmw@strand,
    ereport.cpu.amd.nb.wdog@strand,
    ereport.cpu.amd.unknown@strand;