diff --git a/Makefile b/Makefile index 3f25b0c..2bc59d4 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: clean distclean all force_update .DELETE_ON_ERROR: $(DOC).pdf -XELATEX=xelatex -shell-escape -interaction=nonstopmode +XELATEX=xelatex -shell-escape -halt-on-error -interaction=nonstopmode DOC=hardware_init_review all: $(DOC).pdf diff --git a/bibliographie.bib b/bibliographie.bib index 846e79c..55255b4 100644 --- a/bibliographie.bib +++ b/bibliographie.bib @@ -56,10 +56,21 @@ note = "[Online; accessed 7-May-2024]" @inbook{BKDG, author = {AMD}, +institution = {Advanced Micro Devices, Inc.}, number = {42301}, year = {2013}, month = {01}, -title = {BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 15h Models 00h-0Fh Processors Rev 3.14} +title = {BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 15h Models 00h-0Fh Processors Rev 3.14}, + +} + +@techreport{amd_fam15h_revision_guide, +author = {AMD}, +institution = {Advanced Micro Devices, Inc.}, +number = {48931}, +year = {2013}, +month = {05}, +title = {Revision Guide for AMD Family 15h Models 00h-0Fh Rev 3.10} } @inbook{SR5690BDG, @@ -1221,4 +1232,4 @@ note = "[Online; accessed 17-August-2024]" year = {2019}, note = {Accessed: 2024-08-24}, url = {https://review.coreboot.org/plugins/gitiles/coreboot/+/refs/tags/4.11} -} +} \ No newline at end of file diff --git a/hardware_init_review.bbl b/hardware_init_review.bbl index 746dea1..4efaf52 100644 --- a/hardware_init_review.bbl +++ b/hardware_init_review.bbl @@ -222,6 +222,9 @@ family={AMD}, familyi={A\bibinitperiod}}}% } + \list{institution}{1}{% + {Advanced Micro Devices, Inc.}% + } \strng{namehash}{48af4341f745163f945fa838eeabb062} \strng{fullhash}{48af4341f745163f945fa838eeabb062} \strng{bibnamehash}{48af4341f745163f945fa838eeabb062} @@ -266,6 +269,32 @@ \verb https://developer.amd.com/ \endverb \endentry + \entry{amd_fam15h_revision_guide}{report}{} + \name{author}{1}{}{% + {{hash=48af4341f745163f945fa838eeabb062}{% + family={AMD}, + 
familyi={A\bibinitperiod}}}% + } + \list{institution}{1}{% + {Advanced Micro Devices, Inc.}% + } + \strng{namehash}{48af4341f745163f945fa838eeabb062} + \strng{fullhash}{48af4341f745163f945fa838eeabb062} + \strng{bibnamehash}{48af4341f745163f945fa838eeabb062} + \strng{authorbibnamehash}{48af4341f745163f945fa838eeabb062} + \strng{authornamehash}{48af4341f745163f945fa838eeabb062} + \strng{authorfullhash}{48af4341f745163f945fa838eeabb062} + \field{extraname}{6} + \field{sortinit}{A} + \field{sortinithash}{2f401846e2029bad6b3ecc16d50031e2} + \field{labelnamesource}{author} + \field{labeltitlesource}{title} + \field{month}{05} + \field{number}{48931} + \field{title}{Revision Guide for AMD Family 15h Models 00h-0Fh Rev 3.10} + \field{type}{techreport} + \field{year}{2013} + \endentry \entry{SR5690BDG}{inbook}{} \name{author}{1}{}{% {{hash=48af4341f745163f945fa838eeabb062}{% @@ -278,7 +307,7 @@ \strng{authorbibnamehash}{48af4341f745163f945fa838eeabb062} \strng{authornamehash}{48af4341f745163f945fa838eeabb062} \strng{authorfullhash}{48af4341f745163f945fa838eeabb062} - \field{extraname}{6} + \field{extraname}{7} \field{sortinit}{A} \field{sortinithash}{2f401846e2029bad6b3ecc16d50031e2} \field{labelnamesource}{author} @@ -300,7 +329,7 @@ \strng{authorbibnamehash}{48af4341f745163f945fa838eeabb062} \strng{authornamehash}{48af4341f745163f945fa838eeabb062} \strng{authorfullhash}{48af4341f745163f945fa838eeabb062} - \field{extraname}{7} + \field{extraname}{8} \field{sortinit}{A} \field{sortinithash}{2f401846e2029bad6b3ecc16d50031e2} \field{labelnamesource}{author} diff --git a/hardware_init_review.pdf b/hardware_init_review.pdf index 124166f..e90dbb1 100644 Binary files a/hardware_init_review.pdf and b/hardware_init_review.pdf differ diff --git a/hardware_init_review.tex b/hardware_init_review.tex index 521ebf5..5498a3d 100644 --- a/hardware_init_review.tex +++ b/hardware_init_review.tex @@ -1637,8 +1637,9 @@ Thanks, I guess ? 
(TODO) calibration is critical for maintaining signal integrity under different operating conditions, such as voltage and temperature changes. During initialization, the memory controller issues a - ZQCL command, triggering the calibration sequence that optimizes - impedance settings. This ensures that the memory system can + ZQCL command to the DRAM modules, triggering the calibration + sequence that optimizes impedance settings. + This ensures that the memory system can operate with tight timing tolerances, which is crucial for systems requiring high reliability. Read training is also essential to ensure that data read from @@ -1648,6 +1649,16 @@ Thanks, I guess ? (TODO) received. Proper read training is necessary for reliable data retrieval, which directly impacts system performance and stability. \\ + ZQCS (ZQ Calibration Short) however is a procedure used + to periodically adjust the DRAM's ODT and output driver impedance + during normal operation. Unlike the full ZQCL (ZQ Calibration Long), + which is performed during initial memory initialization, ZQCS is a + quicker, less comprehensive calibration that fine-tunes the + impedance settings in response to changes in temperature, voltage, + or other environmental factors. This helps maintain optimal signal + integrity and performance throughout the memory's operation without + the need for a full recalibration. \\ + In summary, the DDR3 memory initialization process in systems like the ASUS KGPE-D16 involves a series of detailed and interdependent steps that are critical for ensuring system @@ -1779,38 +1790,56 @@ Thanks, I guess ? (TODO) memory module is correctly synchronized with the memory controller, minimizing timing mismatches that could lead to data corruption. \\ + Write leveling implies to perform a DQS position training, a + specific form of training focused on aligning the DQS signal with + the data (DQ) signals during write operations. 
In this process, + the memory controller adjusts the phase of the DQS signal to ensure + that it is correctly aligned with the data signals across all data + lanes, centering the DQS signal within the "data eye" for optimal + timing. This ensures that all data bits are written correctly and + consistently across the memory module, reducing the risk of timing + errors and data corruption. Additionally, DQS receiver training is + also needed to ensure that the memory controller can correctly + capture the DQS signal during read operations + \cite{micron_ddr3}. + The core operation is to make the MCT send out specific test + patterns to the DRAM to determine the timing relationship between + the DQS and data signals, then the MCT adjusts the delay or phase of + the DQS signal relative to the clock signal (CK) and the data + signals (DQ) while checking the integrity of the test data in the + DRAM. \\ + Using seed-based algorithms, the memory controller sets an initial delay value and then iteratively adjusts it based on the feedback received from the memory module. This process ensures that the DQS signal is correctly aligned with the CK signal at the memory module's pins, minimizing the risk of data corruption and ensuring - reliable write operations - \cite{samsung_ddr3}\cite{gopikrishna2021novel}. + reliable write operations \cite{samsung_ddr3}\cite{gopikrishna2021novel}. Seed-based write leveling offers improved precision but must be finely tuned to account for the specific characteristics of the memory module and the overall system architecture \cite{gopikrishna2021novel}. \\ - In contrast to seed-based algorithms, seedless methods - do not rely on an initial reference value. Instead, they - dynamically adjust the impedance and timing parameters during - the calibration process. Seedless ZQ calibration continuously - monitors the impedance of the memory module and makes real-time - adjustments to maintain optimal matching. 
This approach can be - beneficial in environments where the operating conditions are - highly variable, as it allows for more flexible and adaptive - calibration \cite{kim2010design}. Similarly, seedless write - leveling dynamically adjusts the DQS timing based on real-time - feedback from the memory module. This method is particularly - useful in systems where the memory configuration is frequently - changed or where the operating conditions vary significantly - \cite{micron_ddr3}\cite{gopikrishna2021novel}. The traditional - ZQ calibration methods, while effective, often struggle with - matching impedance perfectly across all conditions. A master - thesis by \textcite{gopikrishna2021novel} builds upon these - traditional methods by proposing enhancements that involve more - sophisticated calibration approaches, leading to better impedance - matching and overall memory performance \cite{gopikrishna2021novel}. + In contrast to seed-based algorithms, seedless methods do not rely on + an initial reference value. Instead, they dynamically adjust the + impedance and timing parameters during the calibration process. + Seedless ZQ calibration continuously monitors the impedance of the + memory module and makes real-time adjustments to maintain optimal + matching. This approach can be beneficial in environments where the + operating conditions are highly variable, as it allows for more + flexible and adaptive calibration \cite{kim2010design}. Similarly, + seedless write leveling dynamically adjusts the DQS timing based on + real-time feedback from the memory module. This method is particularly + useful in systems where the memory configuration is frequently changed + or where the operating conditions vary significantly + \cite{micron_ddr3}\cite{gopikrishna2021novel}. The traditional ZQ + calibration methods, while effective, often struggle with matching + impedance perfectly across all conditions. 
A master thesis by + \textcite{gopikrishna2021novel} builds upon these traditional methods + by proposing enhancements that involve more sophisticated calibration + approaches, leading to better impedance matching and overall memory + performance \cite{gopikrishna2021novel}. + \subsection{BIOS and Kernel Developer Guide (BKDG) recommendations} @@ -1965,8 +1994,8 @@ Thanks, I guess ? (TODO) \subsubsection{Write leveling process} - The BIOS and Kernel Developer Guide (BKDG) provides a - comprehensive approach to the write leveling process, which is + The BIOS and Kernel Developer Guide (BKDG) provides + information on the write leveling process, which is essential for ensuring correct data alignment during write operations in DDR3 memory systems. Write leveling is particularly crucial in systems utilizing a fly-by topology, @@ -2008,7 +2037,14 @@ Thanks, I guess ? (TODO) which is particularly important in systems with multiple DIMMs. The steps common to both types include a preparation with the DDR3 Mode Register Commands - (see fig. \ref{fig:ddr3_state_machine}). Mode registers in DDR3 + (see fig. \ref{fig:ddr3_state_machine}). + For RDIMMs, a 4-rank module is treated as two + separate DIMMs, where each rank is essentially a separate memory + module within the same DIMM. The first two ranks are the primary + target for the initial configuration. The remaining two ranks + are treated as non-target and are configured separately. \\ + + Mode registers in DDR3 memory are used to configure various operational parameters such as latency settings, burst length, and write leveling. One of the key mode registers is \path{MR1_dct}, which is specific to @@ -2018,11 +2054,7 @@ Thanks, I guess ? (TODO) driver settings. The \path{dct} suffix refers to the Data Control Timing that is specific to this register's function in managing the timing and control of data operations within the - memory module. 
For RDIMMs, a 4-rank module is treated as two - separate DIMMs, where each rank is essentially a separate memory - module within the same DIMM. The first two ranks are the primary - target for the initial configuration. The remaining two ranks - are treated as non-target and are configured separately. \\ + memory module. \\ Then, these steps are followed, still common to both RDIMMs and UDIMMs: @@ -2300,7 +2332,7 @@ Thanks, I guess ? (TODO) \end{adjustwidth} \caption{ Preparing SMBus, DCTs and NB in - \protect\path{mctAutoInitMCT_D()} + \protect\path{mctAutoInitMCT_D()} from \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mct_d.c}} \label{lst:mctAutoInitMCT_D_4} \end{listing} @@ -2319,7 +2351,7 @@ Thanks, I guess ? (TODO) \end{adjustwidth} \caption{ Get DQS, reset and activate ECC in - \protect\path{mctAutoInitMCT_D()} + \protect\path{mctAutoInitMCT_D()} from \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mct_d.c}} \label{lst:mctAutoInitMCT_D_5} \end{listing} @@ -2337,7 +2369,7 @@ Thanks, I guess ? (TODO) \caption{ Mapping DRAM with cache, validating DCT nodes and finishing the init process in - \protect\path{mctAutoInitMCT_D()} + \protect\path{mctAutoInitMCT_D()} from \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mct_d.c}} \label{lst:mctAutoInitMCT_D_6} \end{listing} @@ -2362,31 +2394,32 @@ Thanks, I guess ? (TODO) includes an early exit condition to bypass DQS training if a specific status flag (\path{GSB_EnDIMMSpareNW}) is set, indicating that a DIMM spare feature is enabled - (lst. \ref{lst:var_decl_and_exit}). \\ + (lst. \ref{lst:var_decl_and_exit}). These spare DIMMs are not + used for normal memory operations but are kept in reserve for + redundancy. 
\\ - \begin{listing} + \begin{listing}[H] \begin{adjustwidth}{0.5cm}{0.5cm} \begin{minted}[linenos]{c} -uint8_t Node; -u8 nv_DQSTrainCTL; -uint8_t retry_requested; - if (pMCTstat->GStatus & (1 << GSB_EnDIMMSpareNW)) { return; } \end{minted} \end{adjustwidth} - \caption{Initial variable declarations and early exit check.} + \caption{Early exit check, + extract from the + \protect\path{DQSTiming_D} function in + \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mct_d.c}} \label{lst:var_decl_and_exit} \end{listing} Next, the function initializes the TCWL (CAS Write Latency) - offset to zero for each node and DCT (DRAM Controller Timing). + offset to zero for each node and DCT. This ensures that the memory write latency is properly aligned before the DQS training begins (lst. \ref{lst:set_tcwl_offset}). \\ - \begin{listing} + \begin{listing}[H] \begin{adjustwidth}{0.5cm}{0.5cm} \begin{minted}[linenos]{c} for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { @@ -2406,17 +2439,10 @@ for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { \end{listing} A retry mechanism is introduced to handle potential errors - during DQS training. The \path{nv_DQSTrainCTL} variable is - set based on the \path{allow_config_restore} parameter, - determining whether to restore a previous configuration or - proceed with fresh training, but non-working on the current - implementation of ASUS KGPE-D16 - (lst. \ref{lst:mctAutoInitMCT_D_fixme}). \\ - - Then, the pre-training function are called + during DQS training and the pre-training functions are called (lst. \ref{lst:retry_pre_training}). \\ - \begin{listing} + \begin{listing}[H] \begin{adjustwidth}{0.5cm}{0.5cm} \begin{minted}[linenos]{c} retry_dqs_training_and_levelization: @@ -2434,30 +2460,27 @@ retry_dqs_training_and_levelization: \end{listing} For AMD's Fam15h processors, additional PHY compensation is - performed for each node and valid DCT + needed for each node and valid DCT (lst. \ref{lst:phy_compensation_init}). 
This is necessary to fine-tune the electrical characteristics of the memory interface. For more information about the PHY training, see the earlier sections about RAM training algorithm. \\ - \begin{listing} + \begin{listing}[H] \begin{adjustwidth}{0.5cm}{0.5cm} \begin{minted}[linenos]{c} -if (is_fam15h()) { - struct DCTStatStruc *pDCTstat; - for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { - pDCTstat = pDCTstatA + Node; - if (pDCTstat->NodePresent) { - if (pDCTstat->DIMMValidDCT[0]) - InitPhyCompensation(pMCTstat, pDCTstat, 0); - if (pDCTstat->DIMMValidDCT[1]) - InitPhyCompensation(pMCTstat, pDCTstat, 1); - } +for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { + pDCTstat = pDCTstatA + Node; + if (pDCTstat->NodePresent) { + if (pDCTstat->DIMMValidDCT[0]) + InitPhyCompensation(pMCTstat, pDCTstat, 0); + if (pDCTstat->DIMMValidDCT[1]) + InitPhyCompensation(pMCTstat, pDCTstat, 1); } } \end{minted} \end{adjustwidth} - \caption{Family-specific PHY compensation initialization for Fam15h processors, + \caption{PHY compensation initialization, extract from the \protect\path{DQSTiming_D} function in \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mct_d.c}} @@ -2466,36 +2489,57 @@ if (is_fam15h()) { Before proceeding with the main DQS training, the function invokes a hook function that allows for additional - configurations or custom operations. \\ + configurations or custom operations: + \path{mctHookBeforeAnyTraining}. \\ + The \path{nv_DQSTrainCTL} variable is + set based on the \path{allow_config_restore} parameter, + determining whether to restore a previous configuration or + proceed with fresh training. This is however not working on the + current implementation of ASUS KGPE-D16 firmware + (lst. \ref{lst:mctAutoInitMCT_D_fixme}). 
If \path{nv_DQSTrainCTL} indicates that fresh training should proceed, the function performs the main DQS training in multiple passes, including receiver enable training with - \path{TrainReceiverEn_D} and DQS position - training with \path{mct_TrainDQSPos_D} - (lst. \ref{dqs_training_process}). The process is - repeated in different modes to achieve optimal timing. \\ + \path{TrainReceiverEn_D}, write leveling with + \path{mct_WriteLevelization_HW}, DQS position + training with \path{mct_TrainDQSPos_D} and the maximum read + latency calculation with \path{TrainMaxRdLatency_En_D} + (lst. \ref{lst:dqs_training_process}). + Write leveling is done in two passes, with a DQS receiver + training in between and another pass of receiver training after. + After that, a DQS position training is done and the process + finishes with the maximum read latency, i.e., the delay between + the request for data and the delivery of that data by the DRAM. + \\ - \begin{listing} + \begin{listing}[H] \begin{adjustwidth}{0.5cm}{0.5cm} \begin{minted}[linenos]{c} if (nv_DQSTrainCTL) { mct_WriteLevelization_HW(pMCTstat, pDCTstatA, FirstPass); - - if (is_fam15h()) { - TrainReceiverEn_D(pMCTstat, pDCTstatA, FirstPass); - } - + TrainReceiverEn_D(pMCTstat, pDCTstatA, FirstPass); mct_WriteLevelization_HW(pMCTstat, pDCTstatA, SecondPass); - if (is_fam15h()) { - TrainReceiverEn_D(pMCTstat, pDCTstatA, FirstPass); - } else { - TrainReceiverEn_D(pMCTstat, pDCTstatA, FirstPass); - } - + /* TODO: Determine why running TrainReceiverEn_D in SecondPass mode yields + * less stable training values than when run in FirstPass mode as in the HACK + * below.*/ + TrainReceiverEn_D(pMCTstat, pDCTstatA, FirstPass); mct_TrainDQSPos_D(pMCTstat, pDCTstatA); [...] 
+ TrainMaxRdLatency_En_D(pMCTstat, pDCTstatA); +} else { + mct_WriteLevelization_HW(pMCTstat, pDCTstatA, FirstPass); + mct_WriteLevelization_HW(pMCTstat, pDCTstatA, SecondPass); +#if CONFIG(HAVE_ACPI_RESUME) + printk(BIOS_DEBUG, "mctAutoInitMCT_D: Restoring DIMM training configuration " + "from NVRAM\n"); + if (restore_mct_information_from_nvram(1) != 0) + printk(BIOS_CRIT, "%s: ERROR: Unable to restore DCT configuration from " + "NVRAM\n", __func__); +#endif + exit_training_mode_fam15(pMCTstat, pDCTstatA); + pMCTstat->GStatus |= 1 << GSB_ConfigRestored; } \end{minted} \end{adjustwidth} @@ -2506,12 +2550,7 @@ if (nv_DQSTrainCTL) { \label{lst:dqs_training_process} \end{listing} - The function checks for any errors during the DQS training. If - errors are detected, it may request a retrain, reset certain - parameters, and restart the training process and even restart - the whole system if needed (lst. \ref{lst:error_handling}). \\ - - \begin{listing} + \begin{listing}[H] \begin{adjustwidth}{0.5cm}{0.5cm} \begin{minted}[linenos]{c} retry_requested = 0; @@ -2534,6 +2573,37 @@ for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { } } } + +if (retry_requested) { + printk(BIOS_DEBUG, "%s: Restarting training on algorithm request\n", + __func__); + /* Reset frequency to minimum */ + for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { + struct DCTStatStruc *pDCTstat; + pDCTstat = pDCTstatA + Node; + if (pDCTstat->NodePresent) { + uint8_t original_target_freq = pDCTstat->TargetFreq; + uint8_t original_auto_speed = pDCTstat->DIMMAutoSpeed; + pDCTstat->TargetFreq = mhz_to_memclk_config(mctGet_NVbits(NV_MIN_MEMCLK)); + pDCTstat->Speed = pDCTstat->DIMMAutoSpeed = pDCTstat->TargetFreq; + SetTargetFreq(pMCTstat, pDCTstatA, Node); + pDCTstat->TargetFreq = original_target_freq; + pDCTstat->DIMMAutoSpeed = original_auto_speed; + } + } + /* Apply any DIMM timing changes */ + for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { + struct DCTStatStruc *pDCTstat; + pDCTstat = pDCTstatA + 
Node; + if (pDCTstat->NodePresent) { + AutoCycTiming_D(pMCTstat, pDCTstat, 0); + if (!pDCTstat->GangedMode) + if (pDCTstat->DIMMValidDCT[1] > 0) + AutoCycTiming_D(pMCTstat, pDCTstat, 1); + } + } + goto retry_dqs_training_and_levelization; +} \end{minted} \end{adjustwidth} \caption{Error detection and retry mechanism during DQS training, @@ -2543,109 +2613,1334 @@ for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { \label{lst:error_handling} \end{listing} + The function checks for any errors during the DQS training. If + errors are detected, it may request a retrain, reset certain + parameters, and restart the training process and even restart + the whole system if needed (lst. \ref{lst:error_handling}). + If the training process is to be restarted, the firmware + sets the DIMM frequencies to minimum and applies timing changes + to DIMMs before jumping to the retry label + (lst. \ref{lst:retry_pre_training}). \\ + Once the training is successfully completed without errors, the function finalizes the process by setting the maximum read latency and exiting the training mode. For systems with \path{allow_config_restore} enabled, it restores the previous configuration from NVRAM instead of performing a fresh training - (lst. \ref{lst:finalization_exit}). 
\\ - - \begin{listing} - \begin{adjustwidth}{0.5cm}{0.5cm} - \begin{minted}[linenos]{c} -TrainMaxRdLatency_En_D(pMCTstat, pDCTstatA); - -if (is_fam15h()) - exit_training_mode_fam15(pMCTstat, pDCTstatA); -else - mctSetEccDQSRcvrEn_D(pMCTstat, pDCTstatA); -} else { - mct_WriteLevelization_HW(pMCTstat, pDCTstatA, FirstPass); - mct_WriteLevelization_HW(pMCTstat, pDCTstatA, SecondPass); - -#if CONFIG(HAVE_ACPI_RESUME) - printk(BIOS_DEBUG, "mctAutoInitMCT_D: Restoring DIMM training configuration from NVRAM\n"); - if (restore_mct_information_from_nvram(1) != 0) - printk(BIOS_CRIT, "%s: ERROR: Unable to restore DCT configuration from NVRAM\n", __func__); -#endif - - if (is_fam15h()) - exit_training_mode_fam15(pMCTstat, pDCTstatA); - - pMCTstat->GStatus |= 1 << GSB_ConfigRestored; -} - \end{minted} - \end{adjustwidth} - \caption{Finalization of DQS training and configuration restoration, - extract from the - \protect\path{DQSTiming_D} function in - \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mct_d.c}} - \label{lst:finalization_exit} - \end{listing} + (lst. \ref{lst:dqs_training_process}). \\ Finally, the function performs a cleanup operation specific to Fam15h processors, where it switches the DCT control register as required by a known erratum from AMD for the BKDG - (Erratum 505). This is followed by a post-training hook that + (Erratum 505) \cite{amd_fam15h_revision_guide}. + This is followed by a post-training hook that allows for any additional necessary actions (lst. \ref{lst:post_training_cleanup}). 
\\ + \begin{listing}[htpb] + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { + pDCTstat = pDCTstatA + Node; + if (pDCTstat->NodePresent) { + fam15h_switch_dct(pDCTstat->dev_map, 0); + } +} + +/* FIXME - currently uses calculated value + * TrainMaxReadLatency_D(pMCTstat, pDCTstatA); */ +mctHookAfterAnyTraining(); + \end{minted} + \end{adjustwidth} + \caption{Post-training cleanup and final hook execution, + extract from the + \protect\path{DQSTiming_D} function in + \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mct_d.c}} + \label{lst:post_training_cleanup} + \end{listing} + + \subsubsection{Details on the write leveling implementation} + + The \path{WriteLevelization_HW} function is responsible for + performing hardware-level write leveling on DRAM modules during + the memory initialization process. Write leveling ensures that + the DQS signals are correctly aligned with the clock signals, + preventing timing mismatches during write operations. \\ + + The function begins by initializing pointers to key data + structures, linking the memory controller (MCT) and DRAM + controller timing (DCT) data for subsequent operations. \\ + + Auto-refresh and short ZQ calibration are temporarily disabled + to prevent interference during the critical timing adjustments + of write leveling. + The memory controller is prepared for write leveling by + configuring necessary parameters with \path{PrepareC_MCT}, + then the main operation can begin. \\ + + In the first pass (lst. \ref{lst:write_level_first_pass}), + the function repeatedly attempts to align + the DQS signals with \path{PhyWLPass1}, retrying if invalid + values are detected. This phase ensures basic alignment for + further fine-tuning. The function retries up to 8 times if it + detects invalid timing values. \\ + + During the second pass (lst. 
\ref{lst:write_level_second_pass}), + the function first checks if the target memory frequency + (\path{TargetFreq}) is higher than the minimum memory clock + frequency stored in the non-volatile bits + (\path{NV_MIN_MEMCLK}). If so, the memory frequency is + incrementally adjusted toward the final target frequency. + This step-by-step approach is crucial, especially for AMD Fam15h + processors, where the frequency must be gradually stepped up to + avoid instability. \\ + + For each frequency step, the write leveling process is + recalibrated by invoking the \path{PhyWLPass2} function. This + function adjusts the DQS timing for each data channel (DCT) and + validates the results. The function retries up to 8 times if it + detects invalid timing values. The global status + (\path{global_phy_training_status}) aggregates the results of + each step, tracking any persistent issues. \\ + + The \path{PhyWLPass1} and \path{PhyWLPass2} functions rely on + \path{AgesaHwWlPhase1}, \path{AgesaHwWlPhase2} and + \path{AgesaHwWlPhase3} for this. \\ + + Once the target frequency is reached and all write leveling + adjustments are made, the final timing values are stored. + The gross and fine delays from the previous passes are copied + into the final pass structures. This ensures that the DQS + timings are consistent and stable across all data channels. \\ + + If any issues persist after retries, the function logs a + warning. This indicates that the system may continue to operate, + but with a potential risk of instability due to imperfect + write leveling calibration. \\ + + After leveling, the function re-enables auto-refresh and short + ZQ calibration, ensuring the memory subsystem is correctly + configured for normal operation. 
\\ + + \begin{listing}[htpb] + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +if (Pass == FirstPass) { + timeout = 0; + do { + status = 0; + timeout++; + status |= PhyWLPass1(pMCTstat, pDCTstat, 0); + status |= PhyWLPass1(pMCTstat, pDCTstat, 1); + if (status) + printk(BIOS_INFO, "%s: Retrying write levelling due to invalid " + "value(s) detected in first phase\n", __func__); + } while (status && (timeout < 8)); + if (status) + printk(BIOS_INFO, "%s: Uncorrectable invalid value(s) detected in first " + "phase of write levelling\n", __func__); +} + \end{minted} + \end{adjustwidth} + \caption{Write leveling (first pass), + extract from the + \protect\path{WriteLevelization_HW} function in + \protect\path{src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c}} + \label{lst:write_level_first_pass} + \end{listing} + + \begin{listing}[H] + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +if (Pass == SecondPass) { + if (pDCTstat->TargetFreq > mhz_to_memclk_config(mctGet_NVbits(NV_MIN_MEMCLK))) { + uint8_t global_phy_training_status = 0; + final_target_freq = pDCTstat->TargetFreq; + + while (pDCTstat->Speed != final_target_freq) { + if (is_fam15h()) + pDCTstat->TargetFreq = + fam15h_next_highest_memclk_freq(pDCTstat->Speed); + else + pDCTstat->TargetFreq = final_target_freq; + SetTargetFreq(pMCTstat, pDCTstatA, Node); + timeout = 0; + do { + status = 0; + timeout++; + status |= PhyWLPass2(pMCTstat, pDCTstat, 0, + (pDCTstat->TargetFreq == final_target_freq)); + status |= PhyWLPass2(pMCTstat, pDCTstat, 1, + (pDCTstat->TargetFreq == final_target_freq)); + if (status) + printk(BIOS_INFO, + "%s: Retrying write levelling due to invalid value(s) " + "detected in last phase\n", + __func__); + } while (status && (timeout < 8)); + global_phy_training_status |= status; + } + + pDCTstat->TargetFreq = final_target_freq; + + if (global_phy_training_status) + printk(BIOS_WARNING, + "%s: Uncorrectable invalid value(s) detected in second phase of " + "write levelling; 
" + "continuing but system may be unstable!\n", + __func__); + + uint8_t dct; + for (dct = 0; dct < 2; dct++) { + sDCTStruct *pDCTData = pDCTstat->C_DCTPtr[dct]; + memcpy(pDCTData->WLGrossDelayFinalPass, + pDCTData->WLGrossDelayPrevPass, + sizeof(pDCTData->WLGrossDelayPrevPass)); + memcpy(pDCTData->WLFineDelayFinalPass, + pDCTData->WLFineDelayPrevPass, + sizeof(pDCTData->WLFineDelayPrevPass)); + pDCTData->WLCriticalGrossDelayFinalPass = + pDCTData->WLCriticalGrossDelayPrevPass; + } + } +} + \end{minted} + \end{adjustwidth} + \caption{Write Leveling (second pass), extract from the + \texttt{WriteLevelization\_HW} function in + \texttt{src/northbridge/amd/amdmct/mct\_ddr3/mcthwl.c}.} + \label{lst:write_level_second_pass} + \end{listing} + + \subsubsection{Details on the write leveling implementation} + + \subsection{Write Leveling on AMD Fam15h G34 Processors with RDIMMs} + + Write leveling is a crucial process in memory initialization + for DDR3 systems, ensuring that the DQS signals are + correctly aligned with the clock signals during write + operations. This is particularly important in systems using + AMD Fam15h processors with G34 sockets and RDIMM. The + write leveling process is divided into three distinct + phases, each managed by a specific function: + \path{AgesaHwWlPhase1}, \path{AgesaHwWlPhase2}, and + \path{AgesaHwWlPhase3}. These phases work together to + fine-tune the timing delays (gross and fine) for each byte + lane, ensuring reliable data transmission. \\ + + The write leveling process begins by selecting the target + DIMM. This is accomplished by programming the + \path{TrDimmSel} register to ensure that the subsequent + operations apply to the correct DIMM. 
\\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT, + DRAM_ADD_DCT_PHY_CONTROL_REG, TrDimmSelStart, + TrDimmSelEnd, (u32)dimm); + \end{minted} + \end{adjustwidth} + \caption{Target DIMM selection for write leveling.} + \label{lst:target_dimm_selection} + \end{listing} + + In the case of x4 DIMMs, which are common in high-density + memory configurations, write leveling must be performed + separately for each nibble (4-bit group). The function + checks if x4 DIMMs are present and, if so, prepares to train + both nibbles. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +train_both_nibbles = 0; +if (pDCTstat->Dimmx4Present) + if (is_fam15h()) + train_both_nibbles = 1; + \end{minted} + \end{adjustwidth} + \caption{Handling of x4 DIMMs and nibble training.} + \label{lst:x4_dimm_handling} + \end{listing} + + The DIMMs are prepared for write leveling by issuing Mode + Register (MR) commands. These commands configure the DIMMs + to enter a state where write leveling can be performed. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +prepareDimms(pMCTstat, pDCTstat, dct, dimm, TRUE); + \end{minted} + \end{adjustwidth} + \caption{Preparing DIMMs for write leveling.} + \label{lst:prepare_dimms} + \end{listing} + + The \path{procConfig} function is called to configure the + processor's DDR PHY (Physical Layer) for write leveling. + This configuration includes setting initial seed values for + gross and fine delays, which are essential for the + subsequent timing adjustments. \\ + + \path{procConfig} generates initial seed values for gross + and fine delays. 
These seeds are calculated based on several + factors: + + \begin{itemize} + \item \textbf{Processor Type:} For Fam15h processors, + specific tables from the Fam15h BKDG \cite{BKDG} are + referenced to select appropriate seed values for + different package types (e.g., Socket G34, Socket + C32). + \item \textbf{DIMM Type:} The seed values are adjusted + based on whether the DIMMs are registered or + load-reduced, with different base values used for + these configurations. + \item \textbf{Memory Clock Frequency:} The seeds are + further adjusted based on the current memory clock + frequency (\path{MemClkFreq}), ensuring that the + timing is correct for the operating speed of the + memory. + \end{itemize} + + The calculated seed values are then scaled to the minimum + supported memory frequency and stored in the + \path{WLSeedGrossDelay} and \path{WLSeedFineDelay} arrays + for each byte lane. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +Seed_Total = (int32_t) (((((int64_t) Seed_Total) * + fam15h_freq_tab[MemClkFreq] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100))); + +Seed_Gross = (Seed_Total >> 5) & 0x1f; +Seed_Fine = Seed_Total & 0x1f; + \end{minted} + \end{adjustwidth} + \caption{Seed generation in \texttt{procConfig}.} + \label{lst:seed_generation} + \end{listing} + + Write leveling is initiated by enabling the + \path{WrtLvTrEn} bit. This allows the DDR PHY to begin + adjusting the DQS signals relative to the clock signals.
\\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT, + DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 1); + \end{minted} + \end{adjustwidth} + \caption{Initiating write leveling training.} + \label{lst:initiate_write_leveling} + \end{listing} + + After a delay to allow the leveling process to stabilize, + the function reads the gross and fine delay values from the + relevant registers and stores them. These values represent + the initial timing adjustments necessary for correct DQS + alignment. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +for (ByteLane = 0; ByteLane < lane_count; ByteLane++) { + getWLByteDelay(pDCTstat, dct, ByteLane, dimm, pass, nibble, lane_count); +} + \end{minted} + \end{adjustwidth} + \caption{Reading and storing delay values after write leveling.} + \label{lst:finalize_write_leveling} + \end{listing} + + If the DIMM is not x4, the function skips the nibble + training loop, as it is unnecessary. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +if ((pDCTstat->Dimmx4Present & (1 << (dimm + dct))) == 0) + break; + \end{minted} + \end{adjustwidth} + \caption{Exit for non-x4 DIMMs.} + \label{lst:exit_non_x4} + \end{listing} + + \subsubsection{Details on the DQS position training function} + + The DQS position training is a crucial step in the memory + initialization process, ensuring that both read and write + operations are correctly aligned with the clock signal. \\ + + The function \path{TrainDQSRdWrPos_D_Fam15} orchestrates this + process by iterating over memory lanes and adjusting timing + parameters to find optimal settings. It is called by + \path{mct_TrainDQSPos_D}. \\ + + The function begins by initializing several variables and + settings necessary for the training process. 
These include: + + \begin{itemize} + \item \texttt{Errors}: A variable to track any errors encountered during the training. + \item \texttt{dual\_rank}: A flag to indicate whether the current DIMM has two ranks. + \item \texttt{passing\_dqs\_delay\_found}: An array to track whether a passing DQS delay has been found for each lane. + \item \texttt{dqs\_results\_array}: A multi-dimensional array to store the results of the DQS delay tests across different write and read steps. + \end{itemize} + + The function then loops over each receiver (loosely associated + with chip selects) to perform the training for each rank within + each DIMM. \\ + \begin{listing} \begin{adjustwidth}{0.5cm}{0.5cm} \begin{minted}[linenos]{c} -if (is_fam15h()) { - struct DCTStatStruc *pDCTstat; - - for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) { - pDCTstat = pDCTstatA + Node; - if (pDCTstat->NodePresent) { - fam15h_switch_dct(pDCTstat->dev_map, 0); - } +for (Receiver = receiver_start; Receiver < receiver_end; Receiver++) { + dimm = (Receiver >> 1); + ... + if (!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, dct, Receiver)) { + continue; } -} - -/* FIXME - currently uses calculated value TrainMaxReadLatency_D(pMCTstat, pDCTstatA); */ -mctHookAfterAnyTraining(); \end{minted} \end{adjustwidth} - \caption{Post-training cleanup and final hook execution} - \label{lst:post_training_cleanup} + \caption{Initialization of variables and looping over each receiver.} + \label{lst:dqs_train_init} \end{listing} + For each lane in the memory channel, the function iterates over + possible write and read delay values to find the optimal + configuration. This is done by: + + \begin{enumerate} + \item Iterating over the write data delay values from the initial value to the initial value plus 1 UI (Unit Interval). + \item For each write data delay, iterating over possible read DQS delay values from 0 to 1 UI. 
+ \item For each combination of write and read delays, testing the configuration by writing a training pattern to the memory and reading it back to check if it passes or fails. + \end{enumerate} + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +for (current_write_data_delay[lane] = initial_write_dqs_delay[lane]; + current_write_data_delay[lane] < (initial_write_dqs_delay[lane] + 0x20); + current_write_data_delay[lane]++) { + ... + for (current_read_dqs_delay[lane] = 0; + current_read_dqs_delay[lane] < 0x20; + current_read_dqs_delay[lane]++) { + ... + write_dqs_read_data_timing_registers(current_read_dqs_delay, dev, dct, dimm, index_reg); + read_dram_dqs_training_pattern_fam15(pMCTstat, pDCTstat, dct, Receiver, lane, ((check_antiphase == 0)?1:0)); + ... + } +} + \end{minted} + \end{adjustwidth} + \caption{Iteration over write and read delay values for each lane.} + \label{lst:dqs_train_iteration} + \end{listing} + + During each iteration, the results are recorded in the + \path{dqs_results_array}, which tracks whether the combination + of write and read delays was successful (pass) or not (fail). + The results are stored for both the primary rank and, if + applicable, the secondary rank when dual rank DIMMs are used. + \\ + + After iterating over all possible delay values, the function + processes the results to determine the best DQS delay settings. + \\ + + This is done by: + + \begin{itemize} + \item Finding the longest consecutive string of passing values for both read and write operations. + \item Calculating the center of the passing region and using this as the optimal delay setting. + \item If the center of the region is below a threshold, issuing a warning that a negative DQS recovery delay was detected, which could lead to instability. 
+ \end{itemize} + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +if (best_count > 2) { + uint16_t region_center = (best_pos + (best_count / 2)); + if (region_center < 16) { + printk(BIOS_WARNING, "TrainDQSRdWrPos: negative DQS recovery delay detected!"); + region_center = 0; + } else { + region_center -= 16; + } + ... + current_read_dqs_delay[lane] = region_center; + passing_dqs_delay_found[lane] = 1; + write_dqs_read_data_timing_registers(current_read_dqs_delay, dev, dct, dimm, index_reg); +} + \end{minted} + \end{adjustwidth} + \caption{Processing the results to determine the best DQS delay settings.} + \label{lst:dqs_train_results} + \end{listing} + + Finally, the function checks if any lane did not find a valid + passing region. If any lanes failed to find a passing DQS delay, + the \path{Errors} flag is set, and this error is propagated + through the \path{pDCTstat->TrainErrors} and + \path{pDCTstat->ErrStatus} variables. + \\ + + The function returns \path{1} if no errors were encountered, + and \texttt{0} otherwise, which is unusual. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +for (lane = lane_start; lane < lane_end; lane++) { + if (!passing_dqs_delay_found[lane]) { + Errors |= 1 << SB_NODQSPOS; + } +} +pDCTstat->TrainErrors |= Errors; +pDCTstat->ErrStatus |= Errors; +return !Errors; + \end{minted} + \end{adjustwidth} + \caption{Final error handling and return value.} + \label{lst:dqs_train_finalize} + \end{listing} + + The DQS position training algorithm implemented in the + \path{TrainDQSRdWrPos_D_Fam15} function systematically explores + the possible delay settings for reading and writing operations + in the memory system. By iterating over a range of values, the + function identifies the optimal delays that result in reliable + data transfer. 
The results are carefully processed to ensure + that the best possible settings are applied, with checks and + balances in place to handle edge cases and potential errors. + \\ + \subsubsection{Details on the DQS receiver training function} - TODO study \path{TrainReceiverEn_D} \\ + In AMD Fam15h G34 processors, the DQS receiver enable training + is a critical step in ensuring that the memory subsystem operates + correctly and reliably. This training aligns the DQS signal with + the clock signal, ensuring proper data capture during memory reads. + \\ - \subsubsection{Details on the DQS position training function} + The DQS receiver enable training algorithm is executed twice: + first at the lowest supported MEMCLK frequency and then at the + highest supported MEMCLK frequency. The purpose of this training + is to fine-tune the timing parameters so that the memory + controller can reliably read data from the memory modules. + The algorithm is implemented in the function + \path{dqsTrainRcvrEn_SW_Fam15} from + \path{src/northbridge/.../mctsrc.c}, which orchestrates the + entire process, called by the \path{mct_TrainRcvrEn_D} function, + which has been called itself by \path{TrainReceiverEn_D} from + \path{src/northbridge/.../mctdqs_d.c}. \\ - TODO study \path{mct_TrainDQSPos_D} \\ + Here, seeds are initial delay values used to set + up the memory controller's timing parameters. These seeds are + generated based on the specific characteristics of the memory + configuration, such as the package type (e.g., G34, C32), the + type of DIMMs installed (Registered, Load Reduced, etc.), and + the maximum number of DIMMs that can be installed in a channel. 
+ \\ - \subsection{Potential enhancements [WIP]} - \begin{itemize} - \item Identifying areas for improvement in the current - implementation - \item Potential enhancements to memory training algorithms - and configuration settings - \item Broader applicability of these improvements to other - systems using \textit{coreboot} - \end{itemize} + The seed generation is handled by the function + \path{fam15_receiver_enable_training_seed}. This function + generates a base seed value for each memory channel, based on + predefined tables in the BKDG \cite{BKDG}. The base seed values + are specific to the memory configuration and are adjusted based + on the type of DIMM and the number of DIMMs in each channel. \\ - FIXME (lst. \ref{lst:mctAutoInitMCT_D_fixme}) \\ + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +uint8_t MaxDimmsInstallable = mctGet_NVbits(NV_MAX_DIMMS_PER_CH); - It seems that that seeds for used for DQS training should be - extensively determined for each motherboard, and the BKDG - \cite{BKDG} does not tell otherwise. Moreover, seeds can be - configured uniquely for every possible socket, channel, DIMM module, - and even byte lane combination. The current implementation of - \path{DQSTiming_D} code is only using the recommended seeds from - the table 99 of the BKDG \cite{BKDG}, which is not sufficient - and absolutely not adapted to every DIMM module in the market. \\ +if (pDCTstat->Status & (1 << SB_Registered)) { + if (package_type == PT_GR) { + // Socket G34: Fam15h BKDG v3.14 Table 99 + if (MaxDimmsInstallable == 1) { + if (channel == 0) + seed = 0x43; + else if (channel == 1) + seed = 0x3f; + else if (channel == 2) + seed = 0x3a; + else if (channel == 3) + seed = 0x35; + } + ... + } + ... 
+} else if (pDCTstat->Status & (1 << SB_LoadReduced)) { + // Load Reduced DIMM configuration + if (package_type == PT_GR) { + // Socket G34: Fam15h BKDG v3.14 Table 99 + if (MaxDimmsInstallable == 1) { + if (channel == 0) + seed = 0x123; + ... + } + } +} + \end{minted} + \end{adjustwidth} + \caption{Seed generation for DQS receiver enable training based on DIMM type and configuration.} + \label{lst:rcvr_en_seed_generation} + \end{listing} + + The generated seed values are then adjusted based on the + operating frequency of the memory (MEMCLK). The adjustment + scales the seed values to account for the difference between + the current memory frequency and the minimum supported + frequency. This ensures that the training can be accurately + performed across different operating conditions. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +initial_seed = (uint16_t) (((((uint64_t) initial_seed) * + fam15h_freq_tab[mem_clk] * 100) / (min_mem_clk * 100))); + \end{minted} + \end{adjustwidth} + \caption{Adjusting the seed values based on the operating frequency of the memory.} + \label{lst:seed_adjustment} + \end{listing} + + Once the seeds are generated and adjusted, they are used to set + the initial delay values for the DQS receiver enable training. + The delay values are split into two components: gross delay and + fine delay. The gross delay determines the overall timing + offset, while the fine delay adjusts the timing with finer + granularity.
\\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +for (lane = 0; lane < lane_count; lane++) { + seed_gross[lane] = (seed[lane] >> 5) & 0x1f; + seed_fine[lane] = seed[lane] & 0x1f; + + if (seed_gross[lane] & 0x1) + seed_pre_gross[lane] = 1; + else + seed_pre_gross[lane] = 2; + + // Set the gross delay + current_total_delay[lane] = ((seed_gross[lane] & 0x1f) << 5); +} + \end{minted} + \end{adjustwidth} + \caption{Setting initial delay values based on the generated seed values.} + \label{lst:initial_delay_values} + \end{listing} + + These delay values are then written to the appropriate registers + to configure the memory controller for the DQS receiver enable + training. The training is performed in multiple steps, + iteratively refining the delay values until the DQS signal is + correctly aligned with the clock signal. \\ + + During the initialization phase, the memory controller is + prepared for training. This includes enabling the training mode, + configuring the memory channels, and disabling certain features + such as ECC (Error-Correcting Code) to prevent interference + during training. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +fam15EnableTrainingMode(pMCTstat, pDCTstat, ch, 1); +_DisableDramECC = mct_DisableDimmEccEn_D(pMCTstat, pDCTstat); + \end{minted} + \end{adjustwidth} + \caption{Initialization phase: Enabling training mode and disabling ECC.} + \label{lst:initialization_phase} + \end{listing} + + The training phase is where the actual alignment of the DQS + signal occurs. The memory controller iterates over each DIMM and + each lane, applying the seed values and adjusting the delay + registers accordingly. For each DIMM, the training is performed + twice: once for the first nibble (lower 4 bits) and once for + the second nibble (upper 4 bits) if the DIMM is x4. 
\\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +for (rank = 0; rank < (_2Ranks + 1); rank++) { + for (nibble = 0; nibble < (train_both_nibbles + 1); nibble++) { + ... + write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg); + ... + } +} + \end{minted} + \end{adjustwidth} + \caption{Training phase: Iterating over ranks and nibbles to apply delay values.} + \label{lst:training_phase} + \end{listing} + + During the training, the controller issues read requests to the + memory to observe the timing of the DQS signal. The observed + delays are then averaged and adjusted to ensure the DQS signal + is correctly aligned across all lanes and ranks. \\ + + In the finalization phase, the memory controller exits the + training mode, and the computed delay values are written back to + the appropriate registers. This ensures that the DQS signal + remains correctly aligned during normal operation. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, 0, 0); +Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, 1, 0); +if (Pass == FirstPass) { + mct_DisableDQSRcvEn_D(pDCTstat); +} + \end{minted} + \end{adjustwidth} + \caption{Finalization phase: Exiting training mode and setting read latency.} + \label{lst:finalization_phase} + \end{listing} + + \subsection{Potential enhancements} + + \subsubsection{DQS receiver training} + + While the DQS receiver enable training implementation for AMD + Fam15h G34 processors can perform its intended function in some + cases, there are several areas where the code is either + incomplete, suboptimal, or potentially problematic. \\ + + The presence of \path{TODO} comments in the code indicates areas + where the implementation is either incomplete or lacks certain + necessary functionality. 
These unaddressed tasks can lead to + performance issues, potential bugs, or incomplete training, + which could compromise the stability and reliability of the + memory subsystem. \\ + + In the seed adjustment section for the second pass of training, + the code includes a \path{TODO} comment regarding fetching the + correct value from \path{RC2[0]} for the \path{addr_prelaunch} + variable: + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +uint8_t addr_prelaunch = 0; /* TODO: Fetch the correct value from RC2[0] */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{TODO} comment indicating an unimplemented feature in the seed adjustment logic.} + \label{lst:todo_rc2} + \end{listing} + + This unimplemented feature suggests that the training process + may not be fully optimized, as the correct prelaunch address + setting is not being applied. This could result in incorrect + seed values being used during the training, leading to + suboptimal alignment of the DQS signal. \\ + + The code contains another \path{TODO} comment indicating that + the support for Load Reduced DIMMs (LRDIMMs) is unimplemented: + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +else if ((pDCTstat->Status & (1 << SB_LoadReduced))) { + /* TODO + * Load reduced DIMM support unimplemented + */ + register_delay = 0x0; +} + \end{minted} + \end{adjustwidth} + \caption{\texttt{TODO} comment indicating that LRDIMM support is unimplemented.} + \label{lst:todo_lrdimm} + \end{listing} + + This omission is significant because LRDIMMs are commonly used + in server environments where high memory capacity is required. + The lack of support for LRDIMMs could lead to incorrect training + or even failures when such DIMMs are installed, severely + impacting the reliability of the system. \\ + + \path{FIXME} comments in the code are often indicators of known + issues or temporary workarounds that need to be addressed. 
In + this implementation, there are several such comments that + highlight critical areas where the current approach may be + flawed or incomplete. \\ + + The first \path{FIXME} comment questions the usage of the + \path{SSEDIS} setting during the training process: + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +lo &= ~(1 << 15); /* SSEDIS */ +_WRMSR(msr, lo, hi); /* Setting wrap32dis allows 64-bit memory references in real mode */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{FIXME} comment questioning the use of \texttt{SSEDIS} in the MSR setting.} + \label{lst:fixme_ssedis} + \end{listing} + + The concern here is that disabling the \path{SSEDIS} + (SSE Disable) bit could have unintended side effects, + particularly in environments where SSE instructions are + expected to be enabled. This could impact the performance of + the system during training and potentially lead to instability. + \\ + + The code also highlights a potential misprint in the BKDG + regarding the \path{DqsRcvEnGrossDelay} and \path{DqsRcvEnFineDelay} settings: + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +/* NOTE: While the BKDG states to only program DqsRcvEnGrossDelay, this appears + * to have been a misprint as DqsRcvEnFineDelay should be set to zero as well. + */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{NOTE} comment pointing out a possible misprint in the BKDG regarding delay settings.} + \label{lst:fixme_misprint} + \end{listing} + + This indicates that the implementation may be based on incorrect + or incomplete documentation, leading to potential errors in + setting the delay values. If this is indeed a misprint in the + BKDG, the correction should be verified with updated + documentation, and the implementation should be adjusted + accordingly. \\ + + In addition to the explicit \path{TODO} and \path{FIXME} + comments, there are other aspects of the implementation that + could impact performance and stability.
\\ + + The logic for adjusting the seed values based on the memory + frequency and the platform's minimum supported frequency is + complex and prone to errors, especially when combined with the + incomplete \path{TODO} features. The risk here is that incorrect + seed values could be used, leading to timing mismatches during + the training process. It seems that the seeds used for DQS + training should be extensively determined for each motherboard, + and the BKDG \cite{BKDG} does not state otherwise. Moreover, + seeds can be configured uniquely for every possible socket, + channel, DIMM module, and even byte lane combination. The current + implementation only uses the recommended seeds from + Table 99 of the BKDG \cite{BKDG}, which is not sufficient + and certainly not adapted to every DIMM module on the market. + \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +if (pDCTstat->Status & (1 << SB_Registered)) { + if (package_type == PT_GR) { + /* Socket G34: Fam15h BKDG v3.14 Table 99 */ + if (MaxDimmsInstallable == 1) { + if (channel == 0) + seed = 0x43; + else if (channel == 1) + seed = 0x3f; + else if (channel == 2) + seed = 0x3a; + else if (channel == 3) + seed = 0x35; + } else if (MaxDimmsInstallable == 2) { + if (channel == 0) + seed = 0x54; + else if (channel == 1) + seed = 0x4d; + else if (channel == 2) + seed = 0x45; + else if (channel == 3) + seed = 0x40; + } else if (MaxDimmsInstallable == 3) { + if (channel == 0) + seed = 0x6b; + else if (channel == 1) + seed = 0x5e; + else if (channel == 2) + seed = 0x4b; + else if (channel == 3) + seed = 0x3d; + } + \end{minted} + \end{adjustwidth} + \caption{Seeds used for DQS Receiver training.} + \label{lst:dqs_receiver_training_seeds} + \end{listing} + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +initial_seed = (uint16_t) (((((uint64_t) initial_seed) * + fam15h_freq_tab[mem_clk] * 100) / (min_mem_clk * 100))); + 
\end{minted} + \end{adjustwidth} + \caption{Complex seed adjustment logic that could lead to timing mismatches.} + \label{lst:seed_adjustment_logic} + \end{listing} + + The current implementation also has limited error handling and + reporting. While some errors are detected during training, the + code does not have robust mechanisms for recovering from or + correcting these errors. \\ + + This approach might lead to further complications in high-load + scenarios or when the memory configuration changes, as the + underlying issues are not resolved. \\ + + \subsubsection{Write leveling} + + While the current implementation of write leveling on AMD Fam15h + G34 processors with RDIMMs can be functional in some cases and + provides the necessary steps to align DQS signals correctly + during write operations, there are several areas where the + implementation is either incomplete, relies on temporary + workarounds, or may introduce stability and performance issues. + \\ + + One of the most significant concerns with the current + implementation is the presence of unresolved \path{TODO} and + \path{FIXME} comments throughout the code. These comments + indicate areas where the implementation is either incomplete or + has known issues that have not been fully resolved. \\ + + In the \path{procConfig} function, a \path{TODO} comment + mentions that the current implementation may not be using + the correct or final value for the \path{AddrCmdPrelaunch} variable, potentially + leading to inaccuracies in the seed values used during write + leveling. This inaccuracy can result in timing mismatches, which + may cause data corruption or other stability issues.
\\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +uint8_t AddrCmdPrelaunch = 0; /* TODO: Fetch the correct value from RC2[0] */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{TODO} indicating incomplete seed generation implementation.} + \label{lst:todo_seed_generation} + \end{listing} + + In \path{AgesaHwWlPhase2}, there is a \path{FIXME} comment that + suggests that the Critical Gross Delay adjustment has been + temporarily disabled due to conflicts with RDIMM training. + Disabling this adjustment can lead to less precise DQS alignment, + especially in complex memory configurations like those using + RDIMMs, potentially causing instability or degraded performance. + \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +/* FIXME: For now, disable CGD adjustment as it seems to interfere with registered DIMM training */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{FIXME} indicating disabled CGD adjustment due to conflicts.} + \label{lst:fixme_cgd_adjustment} + \end{listing} + + Another \path{FIXME} in the code indicates that the + \path{WrDqDqsEarly} parameter, which is critical for fine-tuning + the DQS signal’s timing during write operations, is being + ignored due to unresolved issues. This omission can result in + less accurate timing adjustments, leading to potential marginal + instability in systems where tight timing margins are critical. + \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +/* FIXME: Ignore WrDqDqsEarly for now to work around training issues */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{FIXME} indicating the omission of \texttt{WrDqDqsEarly} parameter.} + \label{lst:fixme_wrdqdqs_early} + \end{listing} + + In \path{AgesaHwWlPhase2}, the function bypasses certain + critical adjustments if the memory speed is being tuned (e.g., + during frequency stepping). 
This bypass is noted as a temporary + measure due to problems encountered during testing, where the + first pass values were found to cause issues with PHY training + on all Family 15h processors tested. This approach indicates a + lack of robustness in the implementation, particularly in + handling dynamic changes in memory frequency, which is essential + for server environments where performance tuning is common. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +/* FIXME: Using the Pass 1 training values causes major phy training problems on + * all Family 15h processors I tested (Pass 1 values are randomly too high, + * and Pass 2 cannot lock). Figure out why this is and fix it, then remove the bypass code below... */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{FIXME} indicating the bypass of critical adjustments during speed tuning.} + \label{lst:fixme_bypass_critical_adjustments} + \end{listing} + + The current implementation attempts to compensate for noise and + instability by overriding faulty values with seed values in + \path{AgesaHwWlPhase2}. However, this approach is somewhat blunt + and reactive, addressing the symptoms rather than the underlying + causes of instability. This method does not ensure that noise or + instability is sufficiently mitigated, potentially leading to + marginal or sporadic failures during normal operation. 
\\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +if (faulty_value_detected) { + pDCTData->WLGrossDelay[index+ByteLane] = pDCTData->WLSeedGrossDelay[index+ByteLane]; + pDCTData->WLFineDelay[index+ByteLane] = pDCTData->WLSeedFineDelay[index+ByteLane]; + status = 1; +} + \end{minted} + \end{adjustwidth} + \caption{Reactive error handling to compensate for noise and instability.} + \label{lst:reactive_error_handling} + \end{listing} + + The current implementation uses generic or "stock" seed values + for certain configurations, such as Socket G34. Without + mainboard-specific overrides, the memory initialization process + might not be fully optimized for the particular motherboard in + use. This could result in suboptimal performance or stability + issues in specific environments, particularly in server + applications where memory performance is critical. \\ + + \begin{listing} + \begin{adjustwidth}{0.5cm}{0.5cm} + \begin{minted}[linenos]{c} +/* FIXME: Implement mainboard-specific seed and WrDqsGrossDly base overrides. + * 0x41 and 0x0 are the "stock" values */ + \end{minted} + \end{adjustwidth} + \caption{\texttt{FIXME} indicating the need for mainboard-specific seed overrides.} + \label{lst:fixme_mainboard_specific_overrides} + \end{listing} + + The handling of x4 DIMMs, with separate training for each nibble, + introduces additional complexity. While necessary for these + configurations, the logic is fragmented, with several points + where the function branches based on whether the DIMM is x4. + This complexity increases the risk of bugs or missed conditions, + particularly if future changes or enhancements are made to the + code. The overcomplicated logic can also make the code more + difficult to maintain and extend. 
\\ + + \subsubsection{DQS position training} + + While the DQS position training algorithm implemented in the + \path{TrainDQSRdWrPos_D_Fam15} function may work in some + cases to ensure optimal data strobe alignment, there are + several critical flaws and issues within the implementation + that could impact its effectiveness and reliability. \\ + + Throughout the function, there is an overreliance on hardcoded + constants and magic numbers, such as: + + \begin{itemize} + \item The use of \texttt{0x20} to represent 1 UI (Unit Interval) in multiple places. + \item The constant \texttt{16} used in the adjustment of \texttt{region\_center} during the processing of results. + \item Magic numbers like \texttt{32} and \texttt{48} in the array dimensions for \texttt{dqs\_results\_array}. + \end{itemize} + + These values should be replaced with named constants or + variables that clearly indicate their purpose, improving code + readability and maintainability. Additionally, using + well-defined constants would allow easier adjustments if the + algorithm needs to be adapted for different hardware + configurations or future revisions of the architecture. \\ + + The error handling within the function is rudimentary, with + errors being flagged primarily by setting bits in the + \texttt{Errors} variable. However, the function does not + provide detailed diagnostics or recovery strategies when an + error occurs. For example: + + \begin{itemize} + \item If no passing DQS delay is found for a lane, the + function simply sets an error bit without attempting any + corrective actions or providing detailed information on + what went wrong. + \item The early abort mechanism based on the value read from + the \texttt{0x264} register does not offer a robust + fallback or retry mechanism, which could lead to + situations where minor, recoverable issues cause the + entire training process to fail.
+ \end{itemize} + + Improving the error handling to include detailed diagnostics, + logging, and potentially corrective actions (such as retrying + the training with adjusted parameters) would make the function + more resilient and reliable. \\ + + The function contains several areas where the logic is more + complex than necessary, which can lead to difficulties in + understanding and maintaining the code. Examples include: + + \begin{itemize} + \item The loops for iterating over write and read + delays are deeply nested, making it challenging to + follow the flow of the code and understand the + interactions between different parts of the algorithm. + \item The use of multiple copies of delay settings (e.g., + \texttt{current\_write\_data\_delay}, + \texttt{initial\_write\_data\_timing}, and + \texttt{initial\_write\_dqs\_delay}) introduces + redundancy and increases the likelihood of errors + or inconsistencies. + \end{itemize} + + Refactoring the code to simplify the logic, reduce redundancy, + and make the flow of operations clearer would improve both the + readability and reliability of the implementation. \\ + + The current implementation does not adequately handle edge cases + and boundary conditions, such as: + + \begin{itemize} + \item The warning issued when a negative DQS recovery delay + is detected suggests that the function continues despite + recognizing a potentially critical issue, which could + lead to system instability. + \item The averaging of delay values for dual-rank DIMMs does + not account for the possibility of significant + discrepancies between the ranks, which could result in + suboptimal or unstable settings. + \item The function does not include comprehensive checks for + situations where the calculated delay settings might + exceed hardware limitations or cause timing violations.
+ \end{itemize} + + Improving the handling of edge cases and boundary conditions, + possibly by incorporating more robust validation checks and + conservative fallback mechanisms, would make the algorithm more + reliable in a wider range of scenarios. \\ + + The code contains several \texttt{TODO} and \texttt{FIXME} + comments that indicate incomplete or problematic parts of + the implementation: + + \begin{itemize} + \item The comment \texttt{TODO: Fetch the correct value + from RC2[0]} suggests that critical configuration values + are not correctly initialized, which could compromise + the entire training process. + \item The \texttt{FIXME} comments related to early abort + checks and DQS recovery delay calculations indicate that + there are known issues with the current approach that + have not been resolved, potentially leading to incorrect + or unstable results. + \item The handling of antiphase results, particularly with + respect to checking for early aborts, is incomplete and + could lead to situations where incorrect results are + accepted without proper validation. + \end{itemize} + + The current implementation's approach to iterating over every + possible combination of write and read delays is exhaustive but + may be inefficient. The function performs multiple reads and + writes to hardware registers for every iteration, which could + be time-consuming, especially on systems with a large number + of lanes or complex memory configurations. \\ + + Consideration should be given to optimizing the algorithm, + possibly by narrowing the search space based on prior knowledge + or implementing more efficient search techniques, to reduce + the time required for DQS position training without compromising + accuracy. 
\\ + + \subsection{On a wider scale...} + + \subsubsection{Saving training values in NVRAM} + + The function \path{mctAutoInitMCT_D} is responsible for + automatically initializing the memory controller training (MCT) + process, which involves configuring various memory parameters + and performing training routines to ensure stable and efficient + memory operation. However, the fact that + \path{mctAutoInitMCT_D} does not allow for the restoration of + training data from NVRAM (lst. \ref{lst:mctAutoInitMCT_D_fixme}) + poses several significant problems. \\ + + Memory training is a time-consuming process that involves + multiple iterations of read/write operations, delay adjustments, + and calibration steps. By not restoring previously saved + training data from NVRAM, the system is forced to re-run the + full training sequence every time it boots up. This leads to + longer boot times, which can be particularly problematic in + environments where quick system restarts are critical, such + as in servers or embedded systems. \\ + + Each time memory training is performed, it puts additional + stress on the memory modules and the memory controller. + Repeatedly executing the training process at every boot can + contribute to the wear and tear of hardware components, + potentially reducing their lifespan. This issue is especially + concerning in systems that frequently power cycle or reboot. \\ + + Memory training is sensitive to various factors, such as + temperature, voltage, and load conditions. As a result, the + training results can vary slightly between different boot + cycles. Without the ability to restore previously validated + training data, there is a risk of inconsistency in memory + performance across reboots. This could lead to instability + or suboptimal memory operation, affecting the overall + performance of the system. \\ + + If the memory training process fails during boot, the system + may be unable to operate properly or may fail to boot entirely.
+ By restoring validated training data from NVRAM, the system + can bypass the training process altogether, reducing the risk + of boot failures caused by training issues. Without this + feature, any minor issue that affects training could result + in system downtime. \\ + + Finally, modern memory controllers often include power-saving + features that are fine-tuned during the training process. By + reusing validated training data from NVRAM, the system can + quickly return to an optimized state with lower power + consumption. + The inability to restore this data forces the system to + operate at a potentially less efficient state until training + is complete, leading to higher power consumption during the + boot process. \\ + + \subsubsection{A seedless DQS position training algorithm} + + An algorithm to find the best timing for the DQS so that the + memory controller can reliably read data from the memory + could be done without relying on any pre-known starting + values (seeds). This would allow for better reliability and + wider support for different situations. The algorithm + could be described as follows. \\ + + \begin{itemize} + \item Prepare Memory Controller: + The memory controller needs to be in a state where it can + safely adjust the DQS timing without affecting the normal + operation of the system. By blocking the DQS signal locking, + we ensure that the adjustments made during training do not + interfere with the controller’s ability to capture data + until the optimal settings are found. + + \item Initialize Variables: + Set up variables to store the various timing settings and + test results for each bytelane. This setup is crucial + because each bytelane might require a different optimal + timing, and keeping track of these values ensures that the + algorithm can correctly determine the best delay settings + later. + \end{itemize} + + The main loop is the core of the algorithm, where different + timing settings are systematically explored.
By looping + through possible delay settings, the algorithm ensures + that it doesn't miss any potential optimal timings. The + loop structure allows a methodical test of a range of + delays to find the most reliable one. \\ + + The gross delay is here the coarse adjustment to the timing + of the DQS signal. It shifts the timing window by a large + amount, helping to broadly align the DQS with the data + lines (DQ). The fine delay, which is the smaller, more + precise change to the timing of the DQS signal once the + coarse alignment (through gross delay) has been achieved, + would then be computed. \\ + + To compute a delay, the steps would be as follows: + + \begin{itemize} + \item Set a delay: + Setting an initial delay allows the algorithm to start + testing. The initial delay might be zero or another default + value, providing a baseline from which to begin the search + for the optimal timing. + + \item Test it: + After setting the delay, it is essential to test whether the + memory controller can read data correctly. This step is + critical because it indicates whether the current delay + setting is within the acceptable range for reliable data + capture. + + \item Check the result: + If the memory controller successfully reads data, it means + the current delay setting is valid. This information is + crucial because it helps define the range of acceptable + timings. If the test fails, it indicates that the current + delay setting is outside the range where the memory + controller can reliably capture data. + + \item Increase/decrease delay: + By incrementally adjusting the delay, either increasing or + decreasing, the algorithm can explore different timing + settings in a controlled manner. This ensures that the + entire range of possible delays is covered without skipping + over any potential good delays.
+ + \item Test again: + Re-testing after each adjustment ensures that the exact + point where the DQS timing goes from acceptable (pass) to + unacceptable (fail) is caught. This step helps in + identifying the transition point, which is often the optimal + place to set the DQS delay. + + \item Look for a transition: + The transition from pass to fail is where the DQS timing + crosses the boundary of the valid timing window. This + transition is crucial because it marks the end of the + reliable range. The best timing is usually just before + this transition. + + \item Record the best setting: + Storing the best delay setting for each bytelane ensures + that a reliable timing configuration is available when the + training is complete. + + \item Confirm all bytelanes: + Before finalizing the settings, it is important to ensure + that the chosen delays work for all bytelanes. This step + serves as a final safeguard against errors, ensuring that + every part of the data bus is correctly aligned. + \end{itemize} + + Each bytelane (8-bit segment of data) may require a + different optimal delay setting. By repeating the process + for all bytelanes, the algorithm ensures that the entire + data bus is correctly timed. Misalignment in even one + bytelane can lead to data errors, making it essential to + tune every bytelane individually. \\ + + Once the best settings are confirmed, they need to be + applied to the memory controller for use during normal + operation. This step locks in the most reliable timing + configuration found during the training process. \\ + + After the optimal settings are applied, it is necessary + to allow the DQS signal locking mechanism to resume. This + locks in the delay settings, ensuring stable operation going + forward. \\ + + Finally, the algorithm needs to indicate whether it was + successful in finding reliable timing settings for all + bytelanes. 
This feedback is crucial for determining whether + the memory system is correctly configured or if further + adjustments or troubleshooting are needed. \\ - See \path{TrainDQSRdWrPos_D_Fam15} in - \path{src/northbridge/amd/amdmct/mct/mct_ddr3/mctdqs_d.c} : allowed - to have negative DQS ("Attempting to continue but your system may - be unstable"). This kind of value should be discarded and - calculation done again. \\ % ------------------------------------------------------------------------------ % CHAPTER 5: Virtualization of the operating system through firmware abstraction diff --git a/hardware_init_review.toc b/hardware_init_review.toc index 17d4b35..296aef9 100644 --- a/hardware_init_review.toc +++ b/hardware_init_review.toc @@ -3,53 +3,62 @@ \contentsline {chapter}{Abstract}{4}{chapter*.2}% \contentsline {chapter}{List of Figures}{7}{chapter*.2}% \contentsline {chapter}{List of Listings}{8}{chapter*.2}% -\contentsline {chapter}{\numberline {1}Introduction to firmware and BIOS evolution}{9}{chapter.1}% -\contentsline {section}{\numberline {1.1}Historical context of BIOS}{9}{section.1.1}% -\contentsline {subsection}{\numberline {1.1.1}Definition and origin}{9}{subsection.1.1.1}% -\contentsline {subsection}{\numberline {1.1.2}Functionalities and limitations}{10}{subsection.1.1.2}% -\contentsline {section}{\numberline {1.2}Modern BIOS and UEFI}{11}{section.1.2}% -\contentsline {subsection}{\numberline {1.2.1}Transition from traditional BIOS to UEFI (Unified Extensible Firmware Interface)}{11}{subsection.1.2.1}% -\contentsline {subsection}{\numberline {1.2.2}An other way with \textit {coreboot}}{11}{subsection.1.2.2}% -\contentsline {section}{\numberline {1.3}Shift in firmware responsibilities}{13}{section.1.3}% -\contentsline {chapter}{\numberline {2}Characteristics of ASUS KGPE-D16 mainboard}{14}{chapter.2}% -\contentsline {section}{\numberline {2.1}Overview of ASUS KGPE-D16 hardware}{15}{section.2.1}% -\contentsline {section}{\numberline 
{2.2}Chipset}{16}{section.2.2}% -\contentsline {section}{\numberline {2.3}Processors}{18}{section.2.3}% -\contentsline {section}{\numberline {2.4}Baseboard Management Controller}{19}{section.2.4}% -\contentsline {chapter}{\numberline {3}Key components in modern firmware}{21}{chapter.3}% -\contentsline {section}{\numberline {3.1}General structure of coreboot}{21}{section.3.1}% -\contentsline {subsection}{\numberline {3.1.1}Bootblock}{22}{subsection.3.1.1}% -\contentsline {subsection}{\numberline {3.1.2}Romstage}{24}{subsection.3.1.2}% -\contentsline {subsection}{\numberline {3.1.3}Ramstage}{25}{subsection.3.1.3}% -\contentsline {subsubsection}{\numberline {3.1.3.1}Advanced Configuration and Power Interface}{25}{subsubsection.3.1.3.1}% -\contentsline {subsubsection}{\numberline {3.1.3.2}System Management Mode}{26}{subsubsection.3.1.3.2}% -\contentsline {subsection}{\numberline {3.1.4}Payload}{26}{subsection.3.1.4}% -\contentsline {section}{\numberline {3.2}AMD Platform Security Processor and Intel Management Engine}{27}{section.3.2}% -\contentsline {chapter}{\numberline {4}Memory initialization and training}{29}{chapter.4}% -\contentsline {section}{\numberline {4.1}Importance of DDR3 Memory Initialization}{29}{section.4.1}% -\contentsline {subsection}{\numberline {4.1.1}General steps for DDR3 configuration}{30}{subsection.4.1.1}% -\contentsline {section}{\numberline {4.2}Memory initialization techniques}{33}{section.4.2}% -\contentsline {subsection}{\numberline {4.2.1}Memory training algorithms}{33}{subsection.4.2.1}% -\contentsline {subsection}{\numberline {4.2.2}BIOS and Kernel Developer Guide (BKDG) recommendations}{34}{subsection.4.2.2}% -\contentsline {subsubsection}{\numberline {4.2.2.1}DDR3 initialization procedure}{35}{subsubsection.4.2.2.1}% -\contentsline {subsubsection}{\numberline {4.2.2.2}ZQ calibration process}{35}{subsubsection.4.2.2.2}% -\contentsline {subsubsection}{\numberline {4.2.2.3}Write leveling process}{36}{subsubsection.4.2.2.3}% 
-\contentsline {section}{\numberline {4.3}Current implementation and potential improvements}{37}{section.4.3}% -\contentsline {subsection}{\numberline {4.3.1}Current implementation in coreboot on the KGPE-D16}{37}{subsection.4.3.1}% -\contentsline {subsubsection}{\numberline {4.3.1.1}Details on the DQS training function}{47}{subsubsection.4.3.1.1}% -\contentsline {subsubsection}{\numberline {4.3.1.2}Details on the DQS receiver training function}{48}{subsubsection.4.3.1.2}% -\contentsline {subsubsection}{\numberline {4.3.1.3}Details on the DQS position training function}{48}{subsubsection.4.3.1.3}% -\contentsline {subsection}{\numberline {4.3.2}Potential enhancements [WIP]}{48}{subsection.4.3.2}% -\contentsline {chapter}{\numberline {5}Virtualization of the operating system through firmware abstraction}{52}{chapter.5}% -\contentsline {section}{\numberline {5.1}ACPI and abstraction of hardware control}{52}{section.5.1}% -\contentsline {section}{\numberline {5.2}SMM as a hidden execution layer}{53}{section.5.2}% -\contentsline {section}{\numberline {5.3}UEFI and persistence}{53}{section.5.3}% -\contentsline {subsection}{\numberline {5.3.1}Memory Management}{54}{subsection.5.3.1}% -\contentsline {subsection}{\numberline {5.3.2}File System Management}{54}{subsection.5.3.2}% -\contentsline {subsection}{\numberline {5.3.3}Device Drivers}{54}{subsection.5.3.3}% -\contentsline {subsection}{\numberline {5.3.4}Power Management}{54}{subsection.5.3.4}% -\contentsline {section}{\numberline {5.4}Intel and AMD: control beyond the OS}{54}{section.5.4}% -\contentsline {section}{\numberline {5.5}The OS as a virtualized environment}{55}{section.5.5}% -\contentsline {chapter}{Conclusion}{56}{chapter*.4}% -\contentsline {chapter}{Bibliography}{57}{chapter*.4}% -\contentsline {chapter}{GNU Free Documentation License}{64}{chapter*.6}% +\contentsline {chapter}{\numberline {1}Introduction to firmware and BIOS evolution}{10}{chapter.1}% +\contentsline {section}{\numberline {1.1}Historical 
context of BIOS}{10}{section.1.1}% +\contentsline {subsection}{\numberline {1.1.1}Definition and origin}{10}{subsection.1.1.1}% +\contentsline {subsection}{\numberline {1.1.2}Functionalities and limitations}{11}{subsection.1.1.2}% +\contentsline {section}{\numberline {1.2}Modern BIOS and UEFI}{12}{section.1.2}% +\contentsline {subsection}{\numberline {1.2.1}Transition from traditional BIOS to UEFI (Unified Extensible Firmware Interface)}{12}{subsection.1.2.1}% +\contentsline {subsection}{\numberline {1.2.2}An other way with \textit {coreboot}}{12}{subsection.1.2.2}% +\contentsline {section}{\numberline {1.3}Shift in firmware responsibilities}{14}{section.1.3}% +\contentsline {chapter}{\numberline {2}Characteristics of ASUS KGPE-D16 mainboard}{15}{chapter.2}% +\contentsline {section}{\numberline {2.1}Overview of ASUS KGPE-D16 hardware}{16}{section.2.1}% +\contentsline {section}{\numberline {2.2}Chipset}{17}{section.2.2}% +\contentsline {section}{\numberline {2.3}Processors}{19}{section.2.3}% +\contentsline {section}{\numberline {2.4}Baseboard Management Controller}{20}{section.2.4}% +\contentsline {chapter}{\numberline {3}Key components in modern firmware}{22}{chapter.3}% +\contentsline {section}{\numberline {3.1}General structure of coreboot}{22}{section.3.1}% +\contentsline {subsection}{\numberline {3.1.1}Bootblock}{23}{subsection.3.1.1}% +\contentsline {subsection}{\numberline {3.1.2}Romstage}{25}{subsection.3.1.2}% +\contentsline {subsection}{\numberline {3.1.3}Ramstage}{26}{subsection.3.1.3}% +\contentsline {subsubsection}{\numberline {3.1.3.1}Advanced Configuration and Power Interface}{26}{subsubsection.3.1.3.1}% +\contentsline {subsubsection}{\numberline {3.1.3.2}System Management Mode}{27}{subsubsection.3.1.3.2}% +\contentsline {subsection}{\numberline {3.1.4}Payload}{27}{subsection.3.1.4}% +\contentsline {section}{\numberline {3.2}AMD Platform Security Processor and Intel Management Engine}{28}{section.3.2}% +\contentsline {chapter}{\numberline {4}Memory 
initialization and training}{30}{chapter.4}% +\contentsline {section}{\numberline {4.1}Importance of DDR3 Memory Initialization}{30}{section.4.1}% +\contentsline {subsection}{\numberline {4.1.1}General steps for DDR3 configuration}{31}{subsection.4.1.1}% +\contentsline {section}{\numberline {4.2}Memory initialization techniques}{34}{section.4.2}% +\contentsline {subsection}{\numberline {4.2.1}Memory training algorithms}{34}{subsection.4.2.1}% +\contentsline {subsection}{\numberline {4.2.2}BIOS and Kernel Developer Guide (BKDG) recommendations}{35}{subsection.4.2.2}% +\contentsline {subsubsection}{\numberline {4.2.2.1}DDR3 initialization procedure}{36}{subsubsection.4.2.2.1}% +\contentsline {subsubsection}{\numberline {4.2.2.2}ZQ calibration process}{36}{subsubsection.4.2.2.2}% +\contentsline {subsubsection}{\numberline {4.2.2.3}Write leveling process}{37}{subsubsection.4.2.2.3}% +\contentsline {section}{\numberline {4.3}Current implementation and potential improvements}{39}{section.4.3}% +\contentsline {subsection}{\numberline {4.3.1}Current implementation in coreboot on the KGPE-D16}{39}{subsection.4.3.1}% +\contentsline {subsubsection}{\numberline {4.3.1.1}Details on the DQS training function}{48}{subsubsection.4.3.1.1}% +\contentsline {subsubsection}{\numberline {4.3.1.2}Details on the write leveling implementation}{51}{subsubsection.4.3.1.2}% +\contentsline {subsubsection}{\numberline {4.3.1.3}Details on the write leveling implementation}{54}{subsubsection.4.3.1.3}% +\contentsline {subsection}{\numberline {4.3.2}Write Leveling on AMD Fam15h G34 Processors with RDIMMs}{54}{subsection.4.3.2}% +\contentsline {subsubsection}{\numberline {4.3.2.1}Details on the DQS position training function}{55}{subsubsection.4.3.2.1}% +\contentsline {subsubsection}{\numberline {4.3.2.2}Details on the DQS receiver training function}{57}{subsubsection.4.3.2.2}% +\contentsline {subsection}{\numberline {4.3.3}Potential enhancements}{60}{subsection.4.3.3}% +\contentsline 
{subsubsection}{\numberline {4.3.3.1}DQS receiver training}{60}{subsubsection.4.3.3.1}% +\contentsline {subsubsection}{\numberline {4.3.3.2}Write leveling}{61}{subsubsection.4.3.3.2}% +\contentsline {subsection}{\numberline {4.3.4}DQS position training}{63}{subsection.4.3.4}% +\contentsline {subsection}{\numberline {4.3.5}On a wider scale...}{65}{subsection.4.3.5}% +\contentsline {subsubsection}{\numberline {4.3.5.1}Saving training values in NVRAM}{65}{subsubsection.4.3.5.1}% +\contentsline {subsubsection}{\numberline {4.3.5.2}A seedless DQS position training algorithm}{66}{subsubsection.4.3.5.2}% +\contentsline {chapter}{\numberline {5}Virtualization of the operating system through firmware abstraction}{68}{chapter.5}% +\contentsline {section}{\numberline {5.1}ACPI and abstraction of hardware control}{68}{section.5.1}% +\contentsline {section}{\numberline {5.2}SMM as a hidden execution layer}{69}{section.5.2}% +\contentsline {section}{\numberline {5.3}UEFI and persistence}{69}{section.5.3}% +\contentsline {subsection}{\numberline {5.3.1}Memory Management}{70}{subsection.5.3.1}% +\contentsline {subsection}{\numberline {5.3.2}File System Management}{70}{subsection.5.3.2}% +\contentsline {subsection}{\numberline {5.3.3}Device Drivers}{70}{subsection.5.3.3}% +\contentsline {subsection}{\numberline {5.3.4}Power Management}{70}{subsection.5.3.4}% +\contentsline {section}{\numberline {5.4}Intel and AMD: control beyond the OS}{70}{section.5.4}% +\contentsline {section}{\numberline {5.5}The OS as a virtualized environment}{71}{section.5.5}% +\contentsline {chapter}{Conclusion}{72}{chapter*.4}% +\contentsline {chapter}{Bibliography}{73}{chapter*.4}% +\contentsline {chapter}{GNU Free Documentation License}{80}{chapter*.6}% diff --git a/packages.tex b/packages.tex index 8952cdb..2cd770d 100644 --- a/packages.tex +++ b/packages.tex @@ -9,6 +9,7 @@ % Free Documentation License". 
\documentclass[french, 11pt]{report} + \usepackage{silence} \usepackage[utf8]{inputenc} \usepackage{url} \usepackage{float} @@ -32,6 +33,8 @@ \usepackage[a4paper, portrait, margin=1.45cm]{geometry} % Set parameters +\WarningsOff + \setcounter{page}{0} \hypersetup{linktoc=all}