Merge 4.9.212 branch 'android-4.9-q' into tw10-android-4.9-q
Documentation/filesystems/fscrypt.rst arch/arm/common/Kconfig arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi arch/arm64/boot/dts/amd/amd-seattle-soc.dtsi arch/arm64/boot/dts/arm/juno-clocks.dtsi arch/arm64/boot/dts/broadcom/ns2.dtsi arch/arm64/boot/dts/lg/lg1312.dtsi arch/arm64/boot/dts/lg/lg1313.dtsi arch/arm64/boot/dts/marvell/armada-37xx.dtsi arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi arch/arm64/boot/dts/nvidia/tegra210.dtsi arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi arch/arm64/boot/dts/qcom/msm8996.dtsi arch/arm64/configs/ranchu64_defconfig arch/arm64/include/asm/cpucaps.h arch/arm64/kernel/cpufeature.c arch/arm64/kernel/traps.c arch/arm64/mm/mmu.c crypto/Makefile crypto/ablkcipher.c crypto/blkcipher.c crypto/testmgr.h crypto/zstd.c drivers/android/binder.c drivers/android/binder_alloc.c drivers/char/random.c drivers/clocksource/exynos_mct.c drivers/dma/pl330.c drivers/hid/hid-sony.c drivers/hid/uhid.c drivers/hid/usbhid/hiddev.c drivers/i2c/i2c-core.c drivers/md/dm-crypt.c drivers/media/v4l2-core/videobuf2-v4l2.c drivers/mmc/host/dw_mmc.c drivers/net/ethernet/broadcom/tg3.c drivers/net/usb/r8152.c drivers/scsi/scsi_logging.c drivers/scsi/sd.c drivers/scsi/ufs/ufshcd-pci.c drivers/scsi/ufs/ufshcd-pltfrm.c drivers/staging/android/Kconfig drivers/staging/android/ion/ion.c drivers/staging/android/ion/ion_priv.h drivers/staging/android/ion/ion_system_heap.c drivers/staging/android/lowmemorykiller.c drivers/tty/serial/samsung.c drivers/usb/dwc3/core.c drivers/usb/dwc3/gadget.c drivers/usb/host/xhci-hub.c drivers/video/fbdev/core/fbmon.c drivers/video/fbdev/core/modedb.c fs/crypto/fname.c fs/crypto/fscrypt_private.h fs/crypto/keyinfo.c fs/ext4/ialloc.c fs/ext4/namei.c fs/ext4/xattr.c fs/f2fs/checkpoint.c fs/f2fs/data.c fs/f2fs/debug.c fs/f2fs/dir.c fs/f2fs/f2fs.h fs/f2fs/file.c fs/f2fs/gc.c fs/f2fs/inline.c fs/f2fs/inode.c fs/f2fs/namei.c fs/f2fs/node.c fs/f2fs/recovery.c fs/f2fs/segment.c fs/f2fs/segment.h 
fs/f2fs/super.c fs/f2fs/sysfs.c fs/fat/dir.c fs/fat/fatent.c fs/file.c fs/namespace.c fs/pnode.c fs/proc/inode.c fs/proc/root.c fs/proc/task_mmu.c fs/sdcardfs/dentry.c fs/sdcardfs/derived_perm.c fs/sdcardfs/file.c fs/sdcardfs/inode.c fs/sdcardfs/lookup.c fs/sdcardfs/main.c fs/sdcardfs/sdcardfs.h fs/sdcardfs/super.c include/linux/blk_types.h include/linux/cpuhotplug.h include/linux/cred.h include/linux/fb.h include/linux/power_supply.h include/linux/sched.h include/linux/zstd.h include/trace/events/sched.h include/uapi/linux/android/binder.h init/Kconfig init/main.c kernel/bpf/hashtab.c kernel/cpu.c kernel/cred.c kernel/fork.c kernel/locking/spinlock_debug.c kernel/panic.c kernel/printk/printk.c kernel/sched/Makefile kernel/sched/core.c kernel/sched/fair.c kernel/sched/rt.c kernel/sched/walt.c kernel/sched/walt.h kernel/trace/trace.c lib/bug.c lib/list_debug.c lib/vsprintf.c lib/zstd/bitstream.h lib/zstd/compress.c lib/zstd/decompress.c lib/zstd/fse.h lib/zstd/fse_compress.c lib/zstd/fse_decompress.c lib/zstd/huf_compress.c lib/zstd/huf_decompress.c lib/zstd/zstd_internal.h mm/debug.c mm/filemap.c mm/rmap.c net/core/filter.c net/ipv4/sysctl_net_ipv4.c net/ipv4/sysfs_net_ipv4.c net/ipv4/tcp_input.c net/ipv4/tcp_output.c net/ipv4/udp.c net/ipv6/netfilter/nf_conntrack_reasm.c net/netfilter/Kconfig net/netfilter/Makefile net/netfilter/xt_qtaguid.c net/netfilter/xt_qtaguid_internal.h net/xfrm/xfrm_policy.c net/xfrm/xfrm_state.c scripts/checkpatch.pl security/selinux/hooks.c sound/core/compress_offload.c
This commit is contained in:
commit
af1d3ae977
4823 changed files with 128774 additions and 44493 deletions
|
@ -1,119 +0,0 @@
|
|||
What: /sys/block/zram<id>/num_reads
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The num_reads file is read-only and specifies the number of
|
||||
reads (failed or successful) done on this device.
|
||||
Now accessible via zram<id>/stat node.
|
||||
|
||||
What: /sys/block/zram<id>/num_writes
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The num_writes file is read-only and specifies the number of
|
||||
writes (failed or successful) done on this device.
|
||||
Now accessible via zram<id>/stat node.
|
||||
|
||||
What: /sys/block/zram<id>/invalid_io
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The invalid_io file is read-only and specifies the number of
|
||||
non-page-size-aligned I/O requests issued to this device.
|
||||
Now accessible via zram<id>/io_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/failed_reads
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The failed_reads file is read-only and specifies the number of
|
||||
failed reads happened on this device.
|
||||
Now accessible via zram<id>/io_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/failed_writes
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The failed_writes file is read-only and specifies the number of
|
||||
failed writes happened on this device.
|
||||
Now accessible via zram<id>/io_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/notify_free
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The notify_free file is read-only. Depending on device usage
|
||||
scenario it may account a) the number of pages freed because
|
||||
of swap slot free notifications or b) the number of pages freed
|
||||
because of REQ_DISCARD requests sent by bio. The former ones
|
||||
are sent to a swap block device when a swap slot is freed, which
|
||||
implies that this disk is being used as a swap disk. The latter
|
||||
ones are sent by filesystem mounted with discard option,
|
||||
whenever some data blocks are getting discarded.
|
||||
Now accessible via zram<id>/io_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/zero_pages
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The zero_pages file is read-only and specifies number of zero
|
||||
filled pages written to this disk. No memory is allocated for
|
||||
such pages.
|
||||
Now accessible via zram<id>/mm_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/orig_data_size
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The orig_data_size file is read-only and specifies uncompressed
|
||||
size of data stored in this disk. This excludes zero-filled
|
||||
pages (zero_pages) since no memory is allocated for them.
|
||||
Unit: bytes
|
||||
Now accessible via zram<id>/mm_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/compr_data_size
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The compr_data_size file is read-only and specifies compressed
|
||||
size of data stored in this disk. So, compression ratio can be
|
||||
calculated using orig_data_size and this statistic.
|
||||
Unit: bytes
|
||||
Now accessible via zram<id>/mm_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/mem_used_total
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The mem_used_total file is read-only and specifies the amount
|
||||
of memory, including allocator fragmentation and metadata
|
||||
overhead, allocated for this disk. So, allocator space
|
||||
efficiency can be calculated using compr_data_size and this
|
||||
statistic.
|
||||
Unit: bytes
|
||||
Now accessible via zram<id>/mm_stat node.
|
||||
|
||||
What: /sys/block/zram<id>/mem_used_max
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The mem_used_max file is read/write and specifies the amount
|
||||
of maximum memory zram have consumed to store compressed data.
|
||||
For resetting the value, you should write "0". Otherwise,
|
||||
you could see -EINVAL.
|
||||
Unit: bytes
|
||||
Downgraded to write-only node: so it's possible to set new
|
||||
value only; its current value is stored in zram<id>/mm_stat
|
||||
node.
|
||||
|
||||
What: /sys/block/zram<id>/mem_limit
|
||||
Date: August 2015
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The mem_limit file is read/write and specifies the maximum
|
||||
amount of memory ZRAM can use to store the compressed data.
|
||||
The limit could be changed in run time and "0" means disable
|
||||
the limit. No limit is the initial state. Unit: bytes
|
||||
Downgraded to write-only node: so it's possible to set new
|
||||
value only; its current value is stored in zram<id>/mm_stat
|
||||
node.
|
|
@ -22,41 +22,6 @@ Description:
|
|||
device. The reset operation frees all the memory associated
|
||||
with this device.
|
||||
|
||||
What: /sys/block/zram<id>/num_reads
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The num_reads file is read-only and specifies the number of
|
||||
reads (failed or successful) done on this device.
|
||||
|
||||
What: /sys/block/zram<id>/num_writes
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The num_writes file is read-only and specifies the number of
|
||||
writes (failed or successful) done on this device.
|
||||
|
||||
What: /sys/block/zram<id>/invalid_io
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The invalid_io file is read-only and specifies the number of
|
||||
non-page-size-aligned I/O requests issued to this device.
|
||||
|
||||
What: /sys/block/zram<id>/failed_reads
|
||||
Date: February 2014
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The failed_reads file is read-only and specifies the number of
|
||||
failed reads happened on this device.
|
||||
|
||||
What: /sys/block/zram<id>/failed_writes
|
||||
Date: February 2014
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
Description:
|
||||
The failed_writes file is read-only and specifies the number of
|
||||
failed writes happened on this device.
|
||||
|
||||
What: /sys/block/zram<id>/max_comp_streams
|
||||
Date: February 2014
|
||||
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
|
||||
|
@ -73,74 +38,24 @@ Description:
|
|||
available and selected compression algorithms, change
|
||||
compression algorithm selection.
|
||||
|
||||
What: /sys/block/zram<id>/notify_free
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The notify_free file is read-only. Depending on device usage
|
||||
scenario it may account a) the number of pages freed because
|
||||
of swap slot free notifications or b) the number of pages freed
|
||||
because of REQ_DISCARD requests sent by bio. The former ones
|
||||
are sent to a swap block device when a swap slot is freed, which
|
||||
implies that this disk is being used as a swap disk. The latter
|
||||
ones are sent by filesystem mounted with discard option,
|
||||
whenever some data blocks are getting discarded.
|
||||
|
||||
What: /sys/block/zram<id>/zero_pages
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The zero_pages file is read-only and specifies number of zero
|
||||
filled pages written to this disk. No memory is allocated for
|
||||
such pages.
|
||||
|
||||
What: /sys/block/zram<id>/orig_data_size
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The orig_data_size file is read-only and specifies uncompressed
|
||||
size of data stored in this disk. This excludes zero-filled
|
||||
pages (zero_pages) since no memory is allocated for them.
|
||||
Unit: bytes
|
||||
|
||||
What: /sys/block/zram<id>/compr_data_size
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The compr_data_size file is read-only and specifies compressed
|
||||
size of data stored in this disk. So, compression ratio can be
|
||||
calculated using orig_data_size and this statistic.
|
||||
Unit: bytes
|
||||
|
||||
What: /sys/block/zram<id>/mem_used_total
|
||||
Date: August 2010
|
||||
Contact: Nitin Gupta <ngupta@vflare.org>
|
||||
Description:
|
||||
The mem_used_total file is read-only and specifies the amount
|
||||
of memory, including allocator fragmentation and metadata
|
||||
overhead, allocated for this disk. So, allocator space
|
||||
efficiency can be calculated using compr_data_size and this
|
||||
statistic.
|
||||
Unit: bytes
|
||||
|
||||
What: /sys/block/zram<id>/mem_used_max
|
||||
Date: August 2014
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The mem_used_max file is read/write and specifies the amount
|
||||
of maximum memory zram have consumed to store compressed data.
|
||||
For resetting the value, you should write "0". Otherwise,
|
||||
you could see -EINVAL.
|
||||
The mem_used_max file is write-only and is used to reset
|
||||
the counter of maximum memory zram have consumed to store
|
||||
compressed data. For resetting the value, you should write
|
||||
"0". Otherwise, you could see -EINVAL.
|
||||
Unit: bytes
|
||||
|
||||
What: /sys/block/zram<id>/mem_limit
|
||||
Date: August 2014
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The mem_limit file is read/write and specifies the maximum
|
||||
amount of memory ZRAM can use to store the compressed data. The
|
||||
limit could be changed in run time and "0" means disable the
|
||||
limit. No limit is the initial state. Unit: bytes
|
||||
The mem_limit file is write-only and specifies the maximum
|
||||
amount of memory ZRAM can use to store the compressed data.
|
||||
The limit could be changed in run time and "0" means disable
|
||||
the limit. No limit is the initial state. Unit: bytes
|
||||
|
||||
What: /sys/block/zram<id>/compact
|
||||
Date: August 2015
|
||||
|
@ -175,3 +90,50 @@ Description:
|
|||
device's debugging info useful for kernel developers. Its
|
||||
format is not documented intentionally and may change
|
||||
anytime without any notice.
|
||||
|
||||
What: /sys/block/zram<id>/backing_dev
|
||||
Date: June 2017
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The backing_dev file is read-write and sets up the backing
|
||||
device for zram to write incompressible pages.
|
||||
For using, user should enable CONFIG_ZRAM_WRITEBACK.
|
||||
|
||||
What: /sys/block/zram<id>/idle
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The idle file is write-only and marks zram slots as idle.
|
||||
If system has mounted debugfs, user can see which slots
|
||||
are idle via /sys/kernel/debug/zram/zram<id>/block_state
|
||||
|
||||
What: /sys/block/zram<id>/writeback
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback file is write-only and triggers idle and/or
|
||||
huge page writeback to backing device.
|
||||
|
||||
What: /sys/block/zram<id>/bd_stat
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The bd_stat file is read-only and represents backing device's
|
||||
statistics (bd_count, bd_reads, bd_writes) in a format
|
||||
similar to block layer statistics file format.
|
||||
|
||||
What: /sys/block/zram<id>/writeback_limit_enable
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback_limit_enable file is read-write and specifies
|
||||
whether the writeback_limit feature is enabled. "1" means enable the feature.
|
||||
"0" (no limit) is the initial state.
|
||||
|
||||
What: /sys/block/zram<id>/writeback_limit
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback_limit file is read-write and specifies the maximum
|
||||
amount of writeback ZRAM can do. The limit could be changed
|
||||
in run time.
|
||||
|
|
|
@ -4,7 +4,7 @@ KernelVersion: 3.10
|
|||
Contact: Samuel Ortiz <sameo@linux.intel.com>
|
||||
linux-mei@linux.intel.com
|
||||
Description: Stores the same MODALIAS value emitted by uevent
|
||||
Format: mei:<mei device name>:<device uuid>:
|
||||
Format: mei:<mei device name>:<device uuid>:<protocol version>
|
||||
|
||||
What: /sys/bus/mei/devices/.../name
|
||||
Date: May 2015
|
||||
|
|
|
@ -356,6 +356,10 @@ What: /sys/devices/system/cpu/vulnerabilities
|
|||
/sys/devices/system/cpu/vulnerabilities/spectre_v1
|
||||
/sys/devices/system/cpu/vulnerabilities/spectre_v2
|
||||
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
|
||||
/sys/devices/system/cpu/vulnerabilities/l1tf
|
||||
/sys/devices/system/cpu/vulnerabilities/mds
|
||||
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
|
||||
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
|
||||
Date: January 2018
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description: Information about CPU vulnerabilities
|
||||
|
@ -367,3 +371,25 @@ Description: Information about CPU vulnerabilities
|
|||
"Not affected" CPU is not affected by the vulnerability
|
||||
"Vulnerable" CPU is affected and no mitigation in effect
|
||||
"Mitigation: $M" CPU is affected and mitigation $M is in effect
|
||||
|
||||
See also: Documentation/hw-vuln/index.rst
|
||||
|
||||
What: /sys/devices/system/cpu/smt
|
||||
/sys/devices/system/cpu/smt/active
|
||||
/sys/devices/system/cpu/smt/control
|
||||
Date: June 2018
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description: Control Symmetric Multi Threading (SMT)
|
||||
|
||||
active: Tells whether SMT is active (enabled and siblings online)
|
||||
|
||||
control: Read/write interface to control SMT. Possible
|
||||
values:
|
||||
|
||||
"on" SMT is enabled
|
||||
"off" SMT is disabled
|
||||
"forceoff" SMT is force disabled. Cannot be changed.
|
||||
"notsupported" SMT is not supported by the CPU
|
||||
|
||||
If control status is "forceoff" or "notsupported" writes
|
||||
are rejected.
|
||||
|
|
|
@ -51,6 +51,14 @@ Description:
|
|||
Controls the dirty page count condition for the in-place-update
|
||||
policies.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/min_seq_blocks
|
||||
Date: August 2018
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the dirty page count condition for batched sequential
|
||||
writes in ->writepages.
|
||||
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/min_hot_blocks
|
||||
Date: March 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
|
@ -78,12 +86,28 @@ Description:
|
|||
The unit size is one block, now only support configuring in range
|
||||
of [1, 512].
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
|
||||
Date: January 2019
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Set timeout to issue discard commands during umount.
|
||||
Default: 5 secs
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/max_victim_search
|
||||
Date: January 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the number of trials to find a victim segment.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/migration_granularity
|
||||
Date: October 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls migration granularity of garbage collection on large
|
||||
section, it can let GC move partial segment{s} of one section
|
||||
in one GC cycle, so that dispersing heavy overhead GC to
|
||||
multiple lightweight one.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/dir_level
|
||||
Date: March 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
|
@ -113,7 +137,22 @@ What: /sys/fs/f2fs/<disk>/idle_interval
|
|||
Date: January 2016
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the idle timing.
|
||||
Controls the idle timing for all paths other than
|
||||
discard and gc path.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/discard_idle_interval
|
||||
Date: September 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||
Description:
|
||||
Controls the idle timing for discard path.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_idle_interval
|
||||
Date: September 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||
Description:
|
||||
Controls the idle timing for gc path.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/iostat_enable
|
||||
Date: August 2017
|
||||
|
|
|
@ -33,7 +33,7 @@ GNU C 3.2 gcc --version
|
|||
GNU make 3.80 make --version
|
||||
binutils 2.12 ld -v
|
||||
util-linux 2.10o fdformat --version
|
||||
module-init-tools 0.9.10 depmod -V
|
||||
kmod 13 depmod -V
|
||||
e2fsprogs 1.41.4 e2fsck -V
|
||||
jfsutils 1.1.3 fsck.jfs -V
|
||||
reiserfsprogs 3.6.3 reiserfsck -V
|
||||
|
@ -143,12 +143,6 @@ is not build with ``CONFIG_KALLSYMS`` and you have no way to rebuild and
|
|||
reproduce the Oops with that option, then you can still decode that Oops
|
||||
with ksymoops.
|
||||
|
||||
Module-Init-Tools
|
||||
-----------------
|
||||
|
||||
A new module loader is now in the kernel that requires ``module-init-tools``
|
||||
to use. It is backward compatible with the 2.4.x series kernels.
|
||||
|
||||
Mkinitrd
|
||||
--------
|
||||
|
||||
|
@ -363,16 +357,17 @@ Util-linux
|
|||
|
||||
- <ftp://ftp.kernel.org/pub/linux/utils/util-linux/>
|
||||
|
||||
Kmod
|
||||
----
|
||||
|
||||
- <https://www.kernel.org/pub/linux/utils/kernel/kmod/>
|
||||
- <https://git.kernel.org/pub/scm/utils/kernel/kmod/kmod.git>
|
||||
|
||||
Ksymoops
|
||||
--------
|
||||
|
||||
- <ftp://ftp.kernel.org/pub/linux/utils/kernel/ksymoops/v2.4/>
|
||||
|
||||
Module-Init-Tools
|
||||
-----------------
|
||||
|
||||
- <ftp://ftp.kernel.org/pub/linux/kernel/people/rusty/modules/>
|
||||
|
||||
Mkinitrd
|
||||
--------
|
||||
|
||||
|
|
180
Documentation/accounting/psi.txt
Normal file
180
Documentation/accounting/psi.txt
Normal file
|
@ -0,0 +1,180 @@
|
|||
================================
|
||||
PSI - Pressure Stall Information
|
||||
================================
|
||||
|
||||
:Date: April, 2018
|
||||
:Author: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
When CPU, memory or IO devices are contended, workloads experience
|
||||
latency spikes, throughput losses, and run the risk of OOM kills.
|
||||
|
||||
Without an accurate measure of such contention, users are forced to
|
||||
either play it safe and under-utilize their hardware resources, or
|
||||
roll the dice and frequently suffer the disruptions resulting from
|
||||
excessive overcommit.
|
||||
|
||||
The psi feature identifies and quantifies the disruptions caused by
|
||||
such resource crunches and the time impact it has on complex workloads
|
||||
or even entire systems.
|
||||
|
||||
Having an accurate measure of productivity losses caused by resource
|
||||
scarcity aids users in sizing workloads to hardware--or provisioning
|
||||
hardware according to workload demand.
|
||||
|
||||
As psi aggregates this information in realtime, systems can be managed
|
||||
dynamically using techniques such as load shedding, migrating jobs to
|
||||
other systems or data centers, or strategically pausing or killing low
|
||||
priority or restartable batch jobs.
|
||||
|
||||
This allows maximizing hardware utilization without sacrificing
|
||||
workload health or risking major disruptions such as OOM kills.
|
||||
|
||||
Pressure interface
|
||||
==================
|
||||
|
||||
Pressure information for each resource is exported through the
|
||||
respective file in /proc/pressure/ -- cpu, memory, and io.
|
||||
|
||||
The format for CPU is as such:
|
||||
|
||||
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||
|
||||
and for memory and IO:
|
||||
|
||||
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||
|
||||
The "some" line indicates the share of time in which at least some
|
||||
tasks are stalled on a given resource.
|
||||
|
||||
The "full" line indicates the share of time in which all non-idle
|
||||
tasks are stalled on a given resource simultaneously. In this state
|
||||
actual CPU cycles are going to waste, and a workload that spends
|
||||
extended time in this state is considered to be thrashing. This has
|
||||
severe impact on performance, and it's useful to distinguish this
|
||||
situation from a state where some tasks are stalled but the CPU is
|
||||
still doing productive work. As such, time spent in this subset of the
|
||||
stall state is tracked separately and exported in the "full" averages.
|
||||
|
||||
The ratios are tracked as recent trends over ten, sixty, and three
|
||||
hundred second windows, which gives insight into short term events as
|
||||
well as medium and long term trends. The total absolute stall time is
|
||||
tracked and exported as well, to allow detection of latency spikes
|
||||
which wouldn't necessarily make a dent in the time averages, or to
|
||||
average trends over custom time frames.
|
||||
|
||||
Monitoring for pressure thresholds
|
||||
==================================
|
||||
|
||||
Users can register triggers and use poll() to be woken up when resource
|
||||
pressure exceeds certain thresholds.
|
||||
|
||||
A trigger describes the maximum cumulative stall time over a specific
|
||||
time window, e.g. 100ms of total stall time within any 500ms window to
|
||||
generate a wakeup event.
|
||||
|
||||
To register a trigger, the user has to open the psi interface file under
|
||||
/proc/pressure/ representing the resource to be monitored and write the
|
||||
desired threshold and time window. The open file descriptor should be
|
||||
used to wait for trigger events using select(), poll() or epoll().
|
||||
The following format is used:
|
||||
|
||||
<some|full> <stall amount in us> <time window in us>
|
||||
|
||||
For example writing "some 150000 1000000" into /proc/pressure/memory
|
||||
would add 150ms threshold for partial memory stall measured within
|
||||
1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
|
||||
would add 50ms threshold for full io stall measured within 1sec time window.
|
||||
|
||||
Triggers can be set on more than one psi metric and more than one trigger
|
||||
for the same psi metric can be specified. However for each trigger a separate
|
||||
file descriptor is required to be able to poll it separately from others,
|
||||
therefore for each trigger a separate open() syscall should be made even
|
||||
when opening the same psi interface file.
|
||||
|
||||
Monitors activate only when system enters stall state for the monitored
|
||||
psi metric and deactivate upon exit from the stall state. While the system is
|
||||
in the stall state psi signal growth is monitored at a rate of 10 times per
|
||||
tracking window.
|
||||
|
||||
The kernel accepts window sizes ranging from 500ms to 10s, therefore min
|
||||
monitoring update interval is 50ms and max is 1s. Min limit is set to
|
||||
prevent overly frequent polling. Max limit is chosen as a high enough number
|
||||
after which monitors are most likely not needed and psi averages can be used
|
||||
instead.
|
||||
|
||||
When activated, psi monitor stays active for at least the duration of one
|
||||
tracking window to avoid repeated activations/deactivations when system is
|
||||
bouncing in and out of the stall state.
|
||||
|
||||
Notifications to the userspace are rate-limited to one per tracking window.
|
||||
|
||||
The trigger will de-register when the file descriptor used to define the
|
||||
trigger is closed.
|
||||
|
||||
Userspace monitor usage example
|
||||
===============================
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <poll.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
|
||||
* Monitor memory partial stall with 1s tracking window size
|
||||
* and 150ms threshold.
|
||||
*/
|
||||
int main() {
|
||||
const char trig[] = "some 150000 1000000";
|
||||
struct pollfd fds;
|
||||
int n;
|
||||
|
||||
fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
|
||||
if (fds.fd < 0) {
|
||||
printf("/proc/pressure/memory open error: %s\n",
|
||||
strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
fds.events = POLLPRI;
|
||||
|
||||
if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
|
||||
printf("/proc/pressure/memory write error: %s\n",
|
||||
strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("waiting for events...\n");
|
||||
while (1) {
|
||||
n = poll(&fds, 1, -1);
|
||||
if (n < 0) {
|
||||
printf("poll error: %s\n", strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
if (fds.revents & POLLERR) {
|
||||
printf("got POLLERR, event source is gone\n");
|
||||
return 0;
|
||||
}
|
||||
if (fds.revents & POLLPRI) {
|
||||
printf("event triggered!\n");
|
||||
} else {
|
||||
printf("unknown event received: 0x%x\n", fds.revents);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Cgroup2 interface
|
||||
=================
|
||||
|
||||
In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem
|
||||
mounted, pressure stall information is also tracked for tasks grouped
|
||||
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
|
||||
cpu.pressure, memory.pressure, and io.pressure files; the format is
|
||||
the same as the /proc/pressure/ files.
|
||||
|
||||
Per-cgroup psi monitors can be specified and used the same way as
|
||||
system-wide ones.
|
|
@ -6,7 +6,7 @@ TL;DR summary
|
|||
* Use only NEON instructions, or VFP instructions that don't rely on support
|
||||
code
|
||||
* Isolate your NEON code in a separate compilation unit, and compile it with
|
||||
'-mfpu=neon -mfloat-abi=softfp'
|
||||
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
|
||||
* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
|
||||
NEON code
|
||||
* Don't sleep in your NEON code, and be aware that it will be executed with
|
||||
|
@ -87,7 +87,7 @@ instructions appearing in unexpected places if no special care is taken.
|
|||
Therefore, the recommended and only supported way of using NEON/VFP in the
|
||||
kernel is by adhering to the following rules:
|
||||
* isolate the NEON code in a separate compilation unit and compile it with
|
||||
'-mfpu=neon -mfloat-abi=softfp';
|
||||
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
|
||||
* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
|
||||
into the unit containing the NEON code from a compilation unit which is *not*
|
||||
built with the GCC flag '-mfpu=neon' set.
|
||||
|
|
|
@ -156,47 +156,24 @@ Per-device statistics are exported as various nodes under /sys/block/zram<id>/
|
|||
A brief description of exported device attributes. For more details please
|
||||
read Documentation/ABI/testing/sysfs-block-zram.
|
||||
|
||||
Name access description
|
||||
---- ------ -----------
|
||||
disksize RW show and set the device's disk size
|
||||
initstate RO shows the initialization state of the device
|
||||
reset WO trigger device reset
|
||||
num_reads RO the number of reads
|
||||
failed_reads RO the number of failed reads
|
||||
num_write RO the number of writes
|
||||
failed_writes RO the number of failed writes
|
||||
invalid_io RO the number of non-page-size-aligned I/O requests
|
||||
max_comp_streams RW the number of possible concurrent compress operations
|
||||
comp_algorithm RW show and change the compression algorithm
|
||||
notify_free RO the number of notifications to free pages (either
|
||||
slot free notifications or REQ_DISCARD requests)
|
||||
zero_pages RO the number of zero filled pages written to this disk
|
||||
orig_data_size RO uncompressed size of data stored in this disk
|
||||
compr_data_size RO compressed size of data stored in this disk
|
||||
mem_used_total RO the amount of memory allocated for this disk
|
||||
mem_used_max RW the maximum amount of memory zram have consumed to
|
||||
store the data (to reset this counter to the actual
|
||||
current value, write 1 to this attribute)
|
||||
mem_limit RW the maximum amount of memory ZRAM can use to store
|
||||
the compressed data
|
||||
pages_compacted RO the number of pages freed during compaction
|
||||
(available only via zram<id>/mm_stat node)
|
||||
compact WO trigger memory compaction
|
||||
debug_stat RO this file is used for zram debugging purposes
|
||||
Name access description
|
||||
---- ------ -----------
|
||||
disksize RW show and set the device's disk size
|
||||
initstate RO shows the initialization state of the device
|
||||
reset WO trigger device reset
|
||||
mem_used_max WO reset the `mem_used_max' counter (see later)
|
||||
mem_limit WO specifies the maximum amount of memory ZRAM can use
|
||||
to store the compressed data
|
||||
writeback_limit WO specifies the maximum amount of write IO zram can
|
||||
write out to backing device as 4KB unit
|
||||
writeback_limit_enable RW show and set writeback_limit feature
|
||||
max_comp_streams RW the number of possible concurrent compress operations
|
||||
comp_algorithm RW show and change the compression algorithm
|
||||
compact WO trigger memory compaction
|
||||
debug_stat RO this file is used for zram debugging purposes
|
||||
backing_dev RW set up backend storage for zram to write out
|
||||
idle WO mark allocated slot as idle
|
||||
|
||||
WARNING
|
||||
=======
|
||||
per-stat sysfs attributes are considered to be deprecated.
|
||||
The basic strategy is:
|
||||
-- the existing RW nodes will be downgraded to WO nodes (in linux 4.11)
|
||||
-- deprecated RO sysfs nodes will eventually be removed (in linux 4.11)
|
||||
|
||||
The list of deprecated attributes can be found here:
|
||||
Documentation/ABI/obsolete/sysfs-block-zram
|
||||
|
||||
Basically, every attribute that has its own read accessible sysfs node
|
||||
(e.g. num_reads) *AND* is accessible via one of the stat files (zram<id>/stat
|
||||
or zram<id>/io_stat or zram<id>/mm_stat) is considered to be deprecated.
|
||||
|
||||
User space is advised to use the following files to read the device statistics.
|
||||
|
||||
|
@ -211,22 +188,52 @@ The stat file represents device's I/O statistics not accounted by block
|
|||
layer and, thus, not available in zram<id>/stat file. It consists of a
|
||||
single line of text and contains the following stats separated by
|
||||
whitespace:
|
||||
failed_reads
|
||||
failed_writes
|
||||
invalid_io
|
||||
notify_free
|
||||
failed_reads the number of failed reads
|
||||
failed_writes the number of failed writes
|
||||
invalid_io the number of non-page-size-aligned I/O requests
|
||||
notify_free Depending on device usage scenario it may account
|
||||
a) the number of pages freed because of swap slot free
|
||||
notifications or b) the number of pages freed because of
|
||||
REQ_DISCARD requests sent by bio. The former ones are
|
||||
sent to a swap block device when a swap slot is freed,
|
||||
which implies that this disk is being used as a swap disk.
|
||||
The latter ones are sent by filesystem mounted with
|
||||
discard option, whenever some data blocks are getting
|
||||
discarded.
|
||||
|
||||
File /sys/block/zram<id>/mm_stat
|
||||
|
||||
The stat file represents device's mm statistics. It consists of a single
|
||||
line of text and contains the following stats separated by whitespace:
|
||||
orig_data_size
|
||||
compr_data_size
|
||||
mem_used_total
|
||||
mem_limit
|
||||
mem_used_max
|
||||
zero_pages
|
||||
num_migrated
|
||||
orig_data_size uncompressed size of data stored in this disk.
|
||||
This excludes same-element-filled pages (same_pages) since
|
||||
no memory is allocated for them.
|
||||
Unit: bytes
|
||||
compr_data_size compressed size of data stored in this disk
|
||||
mem_used_total the amount of memory allocated for this disk. This
|
||||
includes allocator fragmentation and metadata overhead,
|
||||
allocated for this disk. So, allocator space efficiency
|
||||
can be calculated using compr_data_size and this statistic.
|
||||
Unit: bytes
|
||||
mem_limit the maximum amount of memory ZRAM can use to store
|
||||
the compressed data
|
||||
mem_used_max the maximum amount of memory zram has consumed to
|
||||
store the data
|
||||
same_pages the number of same element filled pages written to this disk.
|
||||
No memory is allocated for such pages.
|
||||
pages_compacted the number of pages freed during compaction
|
||||
huge_pages the number of incompressible pages
|
||||
|
||||
File /sys/block/zram<id>/bd_stat
|
||||
|
||||
The stat file represents device's backing device statistics. It consists of
|
||||
a single line of text and contains the following stats separated by whitespace:
|
||||
bd_count size of data written in backing device.
|
||||
Unit: 4K bytes
|
||||
bd_reads the number of reads from backing device
|
||||
Unit: 4K bytes
|
||||
bd_writes the number of writes to backing device
|
||||
Unit: 4K bytes
|
||||
|
||||
9) Deactivate:
|
||||
swapoff /dev/zram0
|
||||
|
@ -241,5 +248,108 @@ line of text and contains the following stats separated by whitespace:
|
|||
resets the disksize to zero. You must set the disksize again
|
||||
before reusing the device.
|
||||
|
||||
* Optional Feature
|
||||
|
||||
= writeback
|
||||
|
||||
With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page
|
||||
to backing storage rather than keeping it in memory.
|
||||
To use the feature, admin should set up backing device via
|
||||
|
||||
"echo /dev/sda5 > /sys/block/zramX/backing_dev"
|
||||
|
||||
before disksize setting. It supports only partition at this moment.
|
||||
If admin want to use incompressible page writeback, they could do via
|
||||
|
||||
"echo huge > /sys/block/zramX/writeback"
|
||||
|
||||
To use idle page writeback, first, user need to declare zram pages
|
||||
as idle.
|
||||
|
||||
"echo all > /sys/block/zramX/idle"
|
||||
|
||||
From now on, any pages on zram are idle pages. The idle mark
|
||||
will be removed only when someone requests access to the block.
|
||||
IOW, unless there is access request, those pages are still idle pages.
|
||||
|
||||
Admin can request writeback of those idle pages at right timing via
|
||||
|
||||
"echo idle > /sys/block/zramX/writeback"
|
||||
|
||||
With the command, zram writes back idle pages from memory to the storage.
|
||||
|
||||
If there are lots of write IO with flash device, potentially, it has
|
||||
flash wearout problem so that admin needs to design write limitation
|
||||
to guarantee storage health for entire product life.
|
||||
|
||||
To overcome the concern, zram supports "writeback_limit" feature.
|
||||
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
|
||||
any writeback. IOW, if admin want to apply writeback budget, he should
|
||||
enable writeback_limit_enable via
|
||||
|
||||
$ echo 1 > /sys/block/zramX/writeback_limit_enable
|
||||
|
||||
Once writeback_limit_enable is set, zram doesn't allow any writeback
|
||||
until admin set the budget via /sys/block/zramX/writeback_limit.
|
||||
|
||||
(If admin doesn't enable writeback_limit_enable, writeback_limit's value
|
||||
assigned via /sys/block/zramX/writeback_limit is meaningless.)
|
||||
|
||||
If admin want to limit writeback as per-day 400M, he could do it
|
||||
like below.
|
||||
|
||||
$ MB_SHIFT=20
|
||||
$ 4K_SHIFT=12
|
||||
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
|
||||
/sys/block/zram0/writeback_limit
|
||||
$ echo 1 > /sys/block/zram0/writeback_limit_enable
|
||||
|
||||
If admin want to allow further write again once the budget is exhausted,
|
||||
he could do it like below
|
||||
|
||||
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
|
||||
/sys/block/zram0/writeback_limit
|
||||
|
||||
If admin want to see remaining writeback budget since he set,
|
||||
|
||||
$ cat /sys/block/zramX/writeback_limit
|
||||
|
||||
If admin want to disable writeback limit, he could do
|
||||
|
||||
$ echo 0 > /sys/block/zramX/writeback_limit_enable
|
||||
|
||||
The writeback_limit count will reset whenever you reset zram(e.g.,
|
||||
system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
|
||||
writeback happened until you reset the zram to allocate extra writeback
|
||||
budget in next setting is user's job.
|
||||
|
||||
If admin want to measure writeback count in a certain period, he could
|
||||
know it via /sys/block/zram0/bd_stat's 3rd column.
|
||||
|
||||
= memory tracking
|
||||
|
||||
With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
|
||||
zram block. It could be useful to catch cold or incompressible
|
||||
pages of the process with pagemap.
|
||||
If you enable the feature, you could see block state via
|
||||
"/sys/kernel/debug/zram/zram0/block_state". The output is as follows,
|
||||
|
||||
300 75.033841 .wh.
|
||||
301 63.806904 s...
|
||||
302 63.806919 ..hi
|
||||
|
||||
First column is zram's block index.
|
||||
Second column is access time since the system was booted
|
||||
Third column is state of the block.
|
||||
(s: same page
|
||||
w: written page to backing store
|
||||
h: huge page
|
||||
i: idle page)
|
||||
|
||||
First line of above example says 300th block is accessed at 75.033841sec
|
||||
and the block's state is huge so it is written back to the backing
|
||||
storage. It's a debugging feature so no one should rely on it to work
|
||||
properly.
|
||||
|
||||
Nitin Gupta
|
||||
ngupta@vflare.org
|
||||
|
|
|
@ -717,6 +717,12 @@ All time durations are in microseconds.
|
|||
$PERIOD duration. If only one number is written, $MAX is
|
||||
updated.
|
||||
|
||||
cpu.pressure
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for CPU. See
|
||||
Documentation/accounting/psi.txt for details.
|
||||
|
||||
|
||||
5-2. Memory
|
||||
|
||||
|
@ -925,6 +931,12 @@ PAGE_SIZE multiple when read back.
|
|||
Swap usage hard limit. If a cgroup's swap usage reaches this
|
||||
limit, anonymous memory of the cgroup will not be swapped out.
|
||||
|
||||
memory.pressure
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for memory. See
|
||||
Documentation/accounting/psi.txt for details.
|
||||
|
||||
|
||||
5-2-2. Usage Guidelines
|
||||
|
||||
|
@ -1055,6 +1067,12 @@ blk-mq devices.
|
|||
|
||||
8:16 rbps=2097152 wbps=max riops=max wiops=max
|
||||
|
||||
io.pressure
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for IO. See
|
||||
Documentation/accounting/psi.txt for details.
|
||||
|
||||
|
||||
5-3-2. Writeback
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ from load_config import loadConfig
|
|||
extensions = ['kernel-doc', 'rstFlatTable', 'kernel_include', 'cdomain']
|
||||
|
||||
# The name of the math extension changed on Sphinx 1.4
|
||||
if major == 1 and minor > 3:
|
||||
if (major == 1 and minor > 3) or (major > 1):
|
||||
extensions.append("sphinx.ext.imgmath")
|
||||
else:
|
||||
extensions.append("sphinx.ext.pngmath")
|
||||
|
|
99
Documentation/device-mapper/dm-bow.txt
Normal file
99
Documentation/device-mapper/dm-bow.txt
Normal file
|
@ -0,0 +1,99 @@
|
|||
dm_bow (backup on write)
|
||||
========================
|
||||
|
||||
dm_bow is a device mapper driver that uses the free space on a device to back up
|
||||
data that is overwritten. The changes can then be committed by a simple state
|
||||
change, or rolled back by removing the dm_bow device and running a command line
|
||||
utility over the underlying device.
|
||||
|
||||
dm_bow has three states, set by writing ‘1’ or ‘2’ to /sys/block/dm-?/bow/state.
|
||||
It is only possible to go from state 0 (initial state) to state 1, and then from
|
||||
state 1 to state 2.
|
||||
|
||||
State 0: dm_bow collects all trims to the device and assumes that these mark
|
||||
free space on the overlying file system that can be safely used. Typically the
|
||||
mount code would create the dm_bow device, mount the file system, call the
|
||||
FITRIM ioctl on the file system then switch to state 1. These trims are not
|
||||
propagated to the underlying device.
|
||||
|
||||
State 1: All writes to the device cause the underlying data to be backed up to
|
||||
the free (trimmed) area as needed in such a way as they can be restored.
|
||||
However, the writes, with one exception, then happen exactly as they would
|
||||
without dm_bow, so the device is always in a good final state. The exception is
|
||||
that sector 0 is used to keep a log of the latest changes, both to indicate that
|
||||
we are in this state and to allow rollback. See below for all details. If there
|
||||
isn't enough free space, writes are failed with -ENOSPC.
|
||||
|
||||
State 2: The transition to state 2 triggers replacing the special sector 0 with
|
||||
the normal sector 0, and the freeing of all state information. dm_bow then
|
||||
becomes a pass-through driver, allowing the device to continue to be used with
|
||||
minimal performance impact.
|
||||
|
||||
Usage
|
||||
=====
|
||||
dm-bow takes one command line parameter, the name of the underlying device.
|
||||
|
||||
dm-bow will typically be used in the following way. dm-bow will be loaded with a
|
||||
suitable underlying device and the resultant device will be mounted. A file
|
||||
system trim will be issued via the FITRIM ioctl, then the device will be
|
||||
switched to state 1. The file system will now be used as normal. At some point,
|
||||
the changes can either be committed by switching to state 2, or rolled back by
|
||||
unmounting the file system, removing the dm-bow device and running the command
|
||||
line utility. Note that rebooting the device will be equivalent to unmounting
|
||||
and removing, but the command line utility must still be run.
|
||||
|
||||
Details of operation in state 1
|
||||
===============================
|
||||
|
||||
dm_bow maintains a type for all sectors. A sector can be any of:
|
||||
|
||||
SECTOR0
|
||||
SECTOR0_CURRENT
|
||||
UNCHANGED
|
||||
FREE
|
||||
CHANGED
|
||||
BACKUP
|
||||
|
||||
SECTOR0 is the first sector on the device, and is used to hold the log of
|
||||
changes. This is the one exception.
|
||||
|
||||
SECTOR0_CURRENT is a sector picked from the FREE sectors, and is where reads and
|
||||
writes from the true sector zero are redirected to. Note that like any backup
|
||||
sector, if the sector is written to directly, it must be moved again.
|
||||
|
||||
UNCHANGED means that the sector has not been changed since we entered state 1.
|
||||
Thus if it is written to or trimmed, the contents must first be backed up.
|
||||
|
||||
FREE means that the sector was trimmed in state 0 and has not yet been written
|
||||
to or used for backup. On being written to, a FREE sector is changed to CHANGED.
|
||||
|
||||
CHANGED means that the sector has been modified, and can be further modified
|
||||
without further backup.
|
||||
|
||||
BACKUP means that this is a free sector being used as a backup. On being written
|
||||
to, the contents must first be backed up again.
|
||||
|
||||
All backup operations are logged to the first sector. The log sector has the
|
||||
format:
|
||||
--------------------------------------------------------
|
||||
| Magic | Count | Sequence | Log entry | Log entry | …
|
||||
--------------------------------------------------------
|
||||
|
||||
Magic is a magic number. Count is the number of log entries. Sequence is 0
|
||||
initially. A log entry is
|
||||
|
||||
-----------------------------------
|
||||
| Source | Dest | Size | Checksum |
|
||||
-----------------------------------
|
||||
|
||||
When SECTOR0 is full, the log sector is backed up and another empty log sector
|
||||
created with sequence number one higher. The first entry in any log sector with
|
||||
sequence > 0 therefore must be the log of the backing up of the previous log
|
||||
sector. Note that sequence is not strictly needed, but is a useful sanity check
|
||||
and potentially limits the time spent trying to restore a corrupted snapshot.
|
||||
|
||||
On entering state 1, dm_bow has a list of free sectors. All other sectors are
|
||||
unchanged. Sector0_current is selected from the free sectors and the contents of
|
||||
sector 0 are copied there. The sector 0 is backed up, which triggers the first
|
||||
log entry to be written.
|
||||
|
|
@ -6,7 +6,8 @@ Required properties:
|
|||
|
||||
"atmel,24c00", "atmel,24c01", "atmel,24c02", "atmel,24c04",
|
||||
"atmel,24c08", "atmel,24c16", "atmel,24c32", "atmel,24c64",
|
||||
"atmel,24c128", "atmel,24c256", "atmel,24c512", "atmel,24c1024"
|
||||
"atmel,24c128", "atmel,24c256", "atmel,24c512", "atmel,24c1024",
|
||||
"atmel,24c2048"
|
||||
|
||||
"catalyst,24c32"
|
||||
|
||||
|
@ -17,7 +18,7 @@ Required properties:
|
|||
If there is no specific driver for <manufacturer>, a generic
|
||||
driver based on <type> is selected. Possible types are:
|
||||
"24c00", "24c01", "24c02", "24c04", "24c08", "24c16", "24c32", "24c64",
|
||||
"24c128", "24c256", "24c512", "24c1024", "spd"
|
||||
"24c128", "24c256", "24c512", "24c1024", "24c2048", "spd"
|
||||
|
||||
- reg : the I2C address of the EEPROM
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ Required properties:
|
|||
- compatible: Should be one of the following:
|
||||
- "microchip,mcp2510" for MCP2510.
|
||||
- "microchip,mcp2515" for MCP2515.
|
||||
- "microchip,mcp25625" for MCP25625.
|
||||
- reg: SPI chip select.
|
||||
- clocks: The clock feeding the CAN controller.
|
||||
- interrupt-parent: The parent interrupt controller.
|
||||
|
|
|
@ -10,6 +10,7 @@ Required properties:
|
|||
Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on
|
||||
the Cadence GEM, or the generic form: "cdns,gem".
|
||||
Use "atmel,sama5d2-gem" for the GEM IP (10/100) available on Atmel sama5d2 SoCs.
|
||||
Use "atmel,sama5d3-macb" for the 10/100Mbit IP available on Atmel sama5d3 SoCs.
|
||||
Use "atmel,sama5d3-gem" for the Gigabit IP available on Atmel sama5d3 SoCs.
|
||||
Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
|
||||
Use "cdns,zynq-gem" for Xilinx Zynq-7xxx SoC.
|
||||
|
|
|
@ -27,4 +27,4 @@ and valid to enable charging:
|
|||
|
||||
- "abracon,tc-diode": should be "standard" (0.6V) or "schottky" (0.3V)
|
||||
- "abracon,tc-resistor": should be <0>, <3>, <6> or <11>. 0 disables the output
|
||||
resistor, the other values are in ohm.
|
||||
resistor, the other values are in kOhm.
|
||||
|
|
|
@ -8,6 +8,6 @@ Required properties:
|
|||
Example:
|
||||
serial@12000 {
|
||||
compatible = "marvell,armada-3700-uart";
|
||||
reg = <0x12000 0x400>;
|
||||
reg = <0x12000 0x200>;
|
||||
interrupts = <43>;
|
||||
};
|
||||
|
|
|
@ -125,6 +125,9 @@ active_logs=%u Support configuring the number of active logs. In the
|
|||
disable_ext_identify Disable the extension list configured by mkfs, so f2fs
|
||||
is not aware of cold files such as media files.
|
||||
inline_xattr Enable the inline xattrs feature.
|
||||
noinline_xattr Disable the inline xattrs feature.
|
||||
inline_xattr_size=%u Support configuring inline xattr size, it depends on
|
||||
flexible inline xattr feature.
|
||||
inline_data Enable the inline data feature: New created small(<~3.4k)
|
||||
files can be written into inode block.
|
||||
inline_dentry Enable the inline dir feature: data in new created
|
||||
|
@ -154,6 +157,27 @@ noinline_data Disable the inline data feature, inline data feature is
|
|||
enabled by default.
|
||||
data_flush Enable data flushing before checkpoint in order to
|
||||
persist data of regular and symlink.
|
||||
fault_injection=%d Enable fault injection in all supported types with
|
||||
specified injection rate.
|
||||
fault_type=%d Support configuring fault injection type, should be
|
||||
enabled with fault_injection option, fault type value
|
||||
is shown below, it supports single or combined type.
|
||||
Type_Name Type_Value
|
||||
FAULT_KMALLOC 0x000000001
|
||||
FAULT_KVMALLOC 0x000000002
|
||||
FAULT_PAGE_ALLOC 0x000000004
|
||||
FAULT_PAGE_GET 0x000000008
|
||||
FAULT_ALLOC_BIO 0x000000010
|
||||
FAULT_ALLOC_NID 0x000000020
|
||||
FAULT_ORPHAN 0x000000040
|
||||
FAULT_BLOCK 0x000000080
|
||||
FAULT_DIR_DEPTH 0x000000100
|
||||
FAULT_EVICT_INODE 0x000000200
|
||||
FAULT_TRUNCATE 0x000000400
|
||||
FAULT_READ_IO 0x000000800
|
||||
FAULT_CHECKPOINT 0x000001000
|
||||
FAULT_DISCARD 0x000002000
|
||||
FAULT_WRITE_IO 0x000004000
|
||||
mode=%s Control block allocation mode which supports "adaptive"
|
||||
and "lfs". In "lfs" mode, there should be no random
|
||||
writes towards main area.
|
||||
|
@ -190,6 +214,11 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix",
|
|||
non-atomic files likewise "nobarrier" mount option.
|
||||
test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt
|
||||
context. The fake fscrypt context is used by xfstests.
|
||||
checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable"
|
||||
to reenable checkpointing. Is enabled by default. While
|
||||
disabled, any unmounting or unexpected shutdowns will cause
|
||||
the filesystem contents to appear as they did when the
|
||||
filesystem was mounted with that option.
|
||||
|
||||
================================================================================
|
||||
DEBUGFS ENTRIES
|
||||
|
|
641
Documentation/filesystems/fscrypt.rst
Normal file
641
Documentation/filesystems/fscrypt.rst
Normal file
|
@ -0,0 +1,641 @@
|
|||
=====================================
|
||||
Filesystem-level encryption (fscrypt)
|
||||
=====================================
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
fscrypt is a library which filesystems can hook into to support
|
||||
transparent encryption of files and directories.
|
||||
|
||||
Note: "fscrypt" in this document refers to the kernel-level portion,
|
||||
implemented in ``fs/crypto/``, as opposed to the userspace tool
|
||||
`fscrypt <https://github.com/google/fscrypt>`_. This document only
|
||||
covers the kernel-level portion. For command-line examples of how to
|
||||
use encryption, see the documentation for the userspace tool `fscrypt
|
||||
<https://github.com/google/fscrypt>`_. Also, it is recommended to use
|
||||
the fscrypt userspace tool, or other existing userspace tools such as
|
||||
`fscryptctl <https://github.com/google/fscryptctl>`_ or `Android's key
|
||||
management system
|
||||
<https://source.android.com/security/encryption/file-based>`_, over
|
||||
using the kernel's API directly. Using existing tools reduces the
|
||||
chance of introducing your own security bugs. (Nevertheless, for
|
||||
completeness this documentation covers the kernel's API anyway.)
|
||||
|
||||
Unlike dm-crypt, fscrypt operates at the filesystem level rather than
|
||||
at the block device level. This allows it to encrypt different files
|
||||
with different keys and to have unencrypted files on the same
|
||||
filesystem. This is useful for multi-user systems where each user's
|
||||
data-at-rest needs to be cryptographically isolated from the others.
|
||||
However, except for filenames, fscrypt does not encrypt filesystem
|
||||
metadata.
|
||||
|
||||
Unlike eCryptfs, which is a stacked filesystem, fscrypt is integrated
|
||||
directly into supported filesystems --- currently ext4, F2FS, and
|
||||
UBIFS. This allows encrypted files to be read and written without
|
||||
caching both the decrypted and encrypted pages in the pagecache,
|
||||
thereby nearly halving the memory used and bringing it in line with
|
||||
unencrypted files. Similarly, half as many dentries and inodes are
|
||||
needed. eCryptfs also limits encrypted filenames to 143 bytes,
|
||||
causing application compatibility issues; fscrypt allows the full 255
|
||||
bytes (NAME_MAX). Finally, unlike eCryptfs, the fscrypt API can be
|
||||
used by unprivileged users, with no need to mount anything.
|
||||
|
||||
fscrypt does not support encrypting files in-place. Instead, it
|
||||
supports marking an empty directory as encrypted. Then, after
|
||||
userspace provides the key, all regular files, directories, and
|
||||
symbolic links created in that directory tree are transparently
|
||||
encrypted.
|
||||
|
||||
Threat model
|
||||
============
|
||||
|
||||
Offline attacks
|
||||
---------------
|
||||
|
||||
Provided that userspace chooses a strong encryption key, fscrypt
|
||||
protects the confidentiality of file contents and filenames in the
|
||||
event of a single point-in-time permanent offline compromise of the
|
||||
block device content. fscrypt does not protect the confidentiality of
|
||||
non-filename metadata, e.g. file sizes, file permissions, file
|
||||
timestamps, and extended attributes. Also, the existence and location
|
||||
of holes (unallocated blocks which logically contain all zeroes) in
|
||||
files is not protected.
|
||||
|
||||
fscrypt is not guaranteed to protect confidentiality or authenticity
|
||||
if an attacker is able to manipulate the filesystem offline prior to
|
||||
an authorized user later accessing the filesystem.
|
||||
|
||||
Online attacks
|
||||
--------------
|
||||
|
||||
fscrypt (and storage encryption in general) can only provide limited
|
||||
protection, if any at all, against online attacks. In detail:
|
||||
|
||||
fscrypt is only resistant to side-channel attacks, such as timing or
|
||||
electromagnetic attacks, to the extent that the underlying Linux
|
||||
Cryptographic API algorithms are. If a vulnerable algorithm is used,
|
||||
such as a table-based implementation of AES, it may be possible for an
|
||||
attacker to mount a side channel attack against the online system.
|
||||
Side channel attacks may also be mounted against applications
|
||||
consuming decrypted data.
|
||||
|
||||
After an encryption key has been provided, fscrypt is not designed to
|
||||
hide the plaintext file contents or filenames from other users on the
|
||||
same system, regardless of the visibility of the keyring key.
|
||||
Instead, existing access control mechanisms such as file mode bits,
|
||||
POSIX ACLs, LSMs, or mount namespaces should be used for this purpose.
|
||||
Also note that as long as the encryption keys are *anywhere* in
|
||||
memory, an online attacker can necessarily compromise them by mounting
|
||||
a physical attack or by exploiting any kernel security vulnerability
|
||||
which provides an arbitrary memory read primitive.
|
||||
|
||||
While it is ostensibly possible to "evict" keys from the system,
|
||||
recently accessed encrypted files will remain accessible at least
|
||||
until the filesystem is unmounted or the VFS caches are dropped, e.g.
|
||||
using ``echo 2 > /proc/sys/vm/drop_caches``. Even after that, if the
|
||||
RAM is compromised before being powered off, it will likely still be
|
||||
possible to recover portions of the plaintext file contents, if not
|
||||
some of the encryption keys as well. (Since Linux v4.12, all
|
||||
in-kernel keys related to fscrypt are sanitized before being freed.
|
||||
However, userspace would need to do its part as well.)
|
||||
|
||||
Currently, fscrypt does not prevent a user from maliciously providing
|
||||
an incorrect key for another user's existing encrypted files. A
|
||||
protection against this is planned.
|
||||
|
||||
Key hierarchy
|
||||
=============
|
||||
|
||||
Master Keys
|
||||
-----------
|
||||
|
||||
Each encrypted directory tree is protected by a *master key*. Master
|
||||
keys can be up to 64 bytes long, and must be at least as long as the
|
||||
greater of the key length needed by the contents and filenames
|
||||
encryption modes being used. For example, if AES-256-XTS is used for
|
||||
contents encryption, the master key must be 64 bytes (512 bits). Note
|
||||
that the XTS mode is defined to require a key twice as long as that
|
||||
required by the underlying block cipher.
|
||||
|
||||
To "unlock" an encrypted directory tree, userspace must provide the
|
||||
appropriate master key. There can be any number of master keys, each
|
||||
of which protects any number of directory trees on any number of
|
||||
filesystems.
|
||||
|
||||
Userspace should generate master keys either using a cryptographically
|
||||
secure random number generator, or by using a KDF (Key Derivation
|
||||
Function). Note that whenever a KDF is used to "stretch" a
|
||||
lower-entropy secret such as a passphrase, it is critical that a KDF
|
||||
designed for this purpose be used, such as scrypt, PBKDF2, or Argon2.
|
||||
|
||||
Per-file keys
|
||||
-------------
|
||||
|
||||
Since each master key can protect many files, it is necessary to
|
||||
"tweak" the encryption of each file so that the same plaintext in two
|
||||
files doesn't map to the same ciphertext, or vice versa. In most
|
||||
cases, fscrypt does this by deriving per-file keys. When a new
|
||||
encrypted inode (regular file, directory, or symlink) is created,
|
||||
fscrypt randomly generates a 16-byte nonce and stores it in the
|
||||
inode's encryption xattr. Then, it uses a KDF (Key Derivation
|
||||
Function) to derive the file's key from the master key and nonce.
|
||||
|
||||
The Adiantum encryption mode (see `Encryption modes and usage`_) is
|
||||
special, since it accepts longer IVs and is suitable for both contents
|
||||
and filenames encryption. For it, a "direct key" option is offered
|
||||
where the file's nonce is included in the IVs and the master key is
|
||||
used for encryption directly. This improves performance; however,
|
||||
users must not use the same master key for any other encryption mode.
|
||||
|
||||
Below, the KDF and design considerations are described in more detail.
|
||||
|
||||
The current KDF works by encrypting the master key with AES-128-ECB,
|
||||
using the file's nonce as the AES key. The output is used as the
|
||||
derived key. If the output is longer than needed, then it is
|
||||
truncated to the needed length.
|
||||
|
||||
Note: this KDF meets the primary security requirement, which is to
|
||||
produce unique derived keys that preserve the entropy of the master
|
||||
key, assuming that the master key is already a good pseudorandom key.
|
||||
However, it is nonstandard and has some problems such as being
|
||||
reversible, so it is generally considered to be a mistake! It may be
|
||||
replaced with HKDF or another more standard KDF in the future.
|
||||
|
||||
Key derivation was chosen over key wrapping because wrapped keys would
|
||||
require larger xattrs which would be less likely to fit in-line in the
|
||||
filesystem's inode table, and there didn't appear to be any
|
||||
significant advantages to key wrapping. In particular, currently
|
||||
there is no requirement to support unlocking a file with multiple
|
||||
alternative master keys or to support rotating master keys. Instead,
|
||||
the master keys may be wrapped in userspace, e.g. as is done by the
|
||||
`fscrypt <https://github.com/google/fscrypt>`_ tool.
|
||||
|
||||
Including the inode number in the IVs was considered. However, it was
|
||||
rejected as it would have prevented ext4 filesystems from being
|
||||
resized, and by itself still wouldn't have been sufficient to prevent
|
||||
the same key from being directly reused for both XTS and CTS-CBC.
|
||||
|
||||
Encryption modes and usage
|
||||
==========================
|
||||
|
||||
fscrypt allows one encryption mode to be specified for file contents
|
||||
and one encryption mode to be specified for filenames. Different
|
||||
directory trees are permitted to use different encryption modes.
|
||||
Currently, the following pairs of encryption modes are supported:
|
||||
|
||||
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
|
||||
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
|
||||
- Adiantum for both contents and filenames
|
||||
|
||||
If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
|
||||
|
||||
AES-128-CBC was added only for low-powered embedded devices with
|
||||
crypto accelerators such as CAAM or CESA that do not support XTS.
|
||||
|
||||
Adiantum is a (primarily) stream cipher-based mode that is fast even
|
||||
on CPUs without dedicated crypto instructions. It's also a true
|
||||
wide-block mode, unlike XTS. It can also eliminate the need to derive
|
||||
per-file keys. However, it depends on the security of two primitives,
|
||||
XChaCha12 and AES-256, rather than just one. See the paper
|
||||
"Adiantum: length-preserving encryption for entry-level processors"
|
||||
(https://eprint.iacr.org/2018/720.pdf) for more details. To use
|
||||
Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
|
||||
implementations of ChaCha and NHPoly1305 should be enabled, e.g.
|
||||
CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
|
||||
|
||||
New encryption modes can be added relatively easily, without changes
|
||||
to individual filesystems. However, authenticated encryption (AE)
|
||||
modes are not currently supported because of the difficulty of dealing
|
||||
with ciphertext expansion.
|
||||
|
||||
Contents encryption
|
||||
-------------------
|
||||
|
||||
For file contents, each filesystem block is encrypted independently.
|
||||
Currently, only the case where the filesystem block size is equal to
|
||||
the system's page size (usually 4096 bytes) is supported.
|
||||
|
||||
Each block's IV is set to the logical block number within the file as
|
||||
a little endian number, except that:
|
||||
|
||||
- With CBC mode encryption, ESSIV is also used. Specifically, each IV
|
||||
is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
|
||||
of the file's data encryption key.
|
||||
|
||||
- In the "direct key" configuration (FS_POLICY_FLAG_DIRECT_KEY set in
|
||||
the fscrypt_policy), the file's nonce is also appended to the IV.
|
||||
Currently this is only allowed with the Adiantum encryption mode.
|
||||
|
||||
Filenames encryption
|
||||
--------------------
|
||||
|
||||
For filenames, each full filename is encrypted at once. Because of
|
||||
the requirements to retain support for efficient directory lookups and
|
||||
filenames of up to 255 bytes, the same IV is used for every filename
|
||||
in a directory.
|
||||
|
||||
However, each encrypted directory still uses a unique key; or
|
||||
alternatively (for the "direct key" configuration) has the file's
|
||||
nonce included in the IVs. Thus, IV reuse is limited to within a
|
||||
single directory.
|
||||
|
||||
With CTS-CBC, the IV reuse means that when the plaintext filenames
|
||||
share a common prefix at least as long as the cipher block size (16
|
||||
bytes for AES), the corresponding encrypted filenames will also share
|
||||
a common prefix. This is undesirable. Adiantum does not have this
|
||||
weakness, as it is a wide-block encryption mode.
|
||||
|
||||
All supported filenames encryption modes accept any plaintext length
|
||||
>= 16 bytes; cipher block alignment is not required. However,
|
||||
filenames shorter than 16 bytes are NUL-padded to 16 bytes before
|
||||
being encrypted. In addition, to reduce leakage of filename lengths
|
||||
via their ciphertexts, all filenames are NUL-padded to the next 4, 8,
|
||||
16, or 32-byte boundary (configurable). 32 is recommended since this
|
||||
provides the best confidentiality, at the cost of making directory
|
||||
entries consume slightly more space. Note that since NUL (``\0``) is
|
||||
not otherwise a valid character in filenames, the padding will never
|
||||
produce duplicate plaintexts.
|
||||
|
||||
Symbolic link targets are considered a type of filename and are
|
||||
encrypted in the same way as filenames in directory entries, except
|
||||
that IV reuse is not a problem as each symlink has its own inode.
|
||||
|
||||
User API
|
||||
========
|
||||
|
||||
Setting an encryption policy
|
||||
----------------------------
|
||||
|
||||
The FS_IOC_SET_ENCRYPTION_POLICY ioctl sets an encryption policy on an
|
||||
empty directory or verifies that a directory or regular file already
|
||||
has the specified encryption policy. It takes in a pointer to a
|
||||
:c:type:`struct fscrypt_policy`, defined as follows::
|
||||
|
||||
#define FS_KEY_DESCRIPTOR_SIZE 8
|
||||
|
||||
struct fscrypt_policy {
|
||||
__u8 version;
|
||||
__u8 contents_encryption_mode;
|
||||
__u8 filenames_encryption_mode;
|
||||
__u8 flags;
|
||||
__u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
|
||||
};
|
||||
|
||||
This structure must be initialized as follows:
|
||||
|
||||
- ``version`` must be 0.
|
||||
|
||||
- ``contents_encryption_mode`` and ``filenames_encryption_mode`` must
|
||||
be set to constants from ``<linux/fs.h>`` which identify the
|
||||
encryption modes to use. If unsure, use
|
||||
FS_ENCRYPTION_MODE_AES_256_XTS (1) for ``contents_encryption_mode``
|
||||
and FS_ENCRYPTION_MODE_AES_256_CTS (4) for
|
||||
``filenames_encryption_mode``.
|
||||
|
||||
- ``flags`` must contain a value from ``<linux/fs.h>`` which
|
||||
identifies the amount of NUL-padding to use when encrypting
|
||||
filenames. If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3).
|
||||
In addition, if the chosen encryption modes are both
|
||||
FS_ENCRYPTION_MODE_ADIANTUM, this can contain
|
||||
FS_POLICY_FLAG_DIRECT_KEY to specify that the master key should be
|
||||
used directly, without key derivation.
|
||||
|
||||
- ``master_key_descriptor`` specifies how to find the master key in
|
||||
the keyring; see `Adding keys`_. It is up to userspace to choose a
|
||||
unique ``master_key_descriptor`` for each master key. The e4crypt
|
||||
and fscrypt tools use the first 8 bytes of
|
||||
``SHA-512(SHA-512(master_key))``, but this particular scheme is not
|
||||
required. Also, the master key need not be in the keyring yet when
|
||||
FS_IOC_SET_ENCRYPTION_POLICY is executed. However, it must be added
|
||||
before any files can be created in the encrypted directory.
|
||||
|
||||
If the file is not yet encrypted, then FS_IOC_SET_ENCRYPTION_POLICY
|
||||
verifies that the file is an empty directory. If so, the specified
|
||||
encryption policy is assigned to the directory, turning it into an
|
||||
encrypted directory. After that, and after providing the
|
||||
corresponding master key as described in `Adding keys`_, all regular
|
||||
files, directories (recursively), and symlinks created in the
|
||||
directory will be encrypted, inheriting the same encryption policy.
|
||||
The filenames in the directory's entries will be encrypted as well.
|
||||
|
||||
Alternatively, if the file is already encrypted, then
|
||||
FS_IOC_SET_ENCRYPTION_POLICY validates that the specified encryption
|
||||
policy exactly matches the actual one. If they match, then the ioctl
|
||||
returns 0. Otherwise, it fails with EEXIST. This works on both
|
||||
regular files and directories, including nonempty directories.
|
||||
|
||||
Note that the ext4 filesystem does not allow the root directory to be
|
||||
encrypted, even if it is empty. Users who want to encrypt an entire
|
||||
filesystem with one key should consider using dm-crypt instead.
|
||||
|
||||
FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors:
|
||||
|
||||
- ``EACCES``: the file is not owned by the process's uid, nor does the
|
||||
process have the CAP_FOWNER capability in a namespace with the file
|
||||
owner's uid mapped
|
||||
- ``EEXIST``: the file is already encrypted with an encryption policy
|
||||
different from the one specified
|
||||
- ``EINVAL``: an invalid encryption policy was specified (invalid
|
||||
version, mode(s), or flags)
|
||||
- ``ENOTDIR``: the file is unencrypted and is a regular file, not a
|
||||
directory
|
||||
- ``ENOTEMPTY``: the file is unencrypted and is a nonempty directory
|
||||
- ``ENOTTY``: this type of filesystem does not implement encryption
|
||||
- ``EOPNOTSUPP``: the kernel was not configured with encryption
|
||||
support for this filesystem, or the filesystem superblock has not
|
||||
had encryption enabled on it. (For example, to use encryption on an
|
||||
ext4 filesystem, CONFIG_EXT4_ENCRYPTION must be enabled in the
|
||||
kernel config, and the superblock must have had the "encrypt"
|
||||
feature flag enabled using ``tune2fs -O encrypt`` or ``mkfs.ext4 -O
|
||||
encrypt``.)
|
||||
- ``EPERM``: this directory may not be encrypted, e.g. because it is
|
||||
the root directory of an ext4 filesystem
|
||||
- ``EROFS``: the filesystem is readonly
|
||||
|
||||
Getting an encryption policy
|
||||
----------------------------
|
||||
|
||||
The FS_IOC_GET_ENCRYPTION_POLICY ioctl retrieves the :c:type:`struct
|
||||
fscrypt_policy`, if any, for a directory or regular file. See above
|
||||
for the struct definition. No additional permissions are required
|
||||
beyond the ability to open the file.
|
||||
|
||||
FS_IOC_GET_ENCRYPTION_POLICY can fail with the following errors:
|
||||
|
||||
- ``EINVAL``: the file is encrypted, but it uses an unrecognized
|
||||
encryption context format
|
||||
- ``ENODATA``: the file is not encrypted
|
||||
- ``ENOTTY``: this type of filesystem does not implement encryption
|
||||
- ``EOPNOTSUPP``: the kernel was not configured with encryption
|
||||
support for this filesystem
|
||||
|
||||
Note: if you only need to know whether a file is encrypted or not, on
|
||||
most filesystems it is also possible to use the FS_IOC_GETFLAGS ioctl
|
||||
and check for FS_ENCRYPT_FL, or to use the statx() system call and
|
||||
check for STATX_ATTR_ENCRYPTED in stx_attributes.
|
||||
|
||||
Getting the per-filesystem salt
|
||||
-------------------------------
|
||||
|
||||
Some filesystems, such as ext4 and F2FS, also support the deprecated
|
||||
ioctl FS_IOC_GET_ENCRYPTION_PWSALT. This ioctl retrieves a randomly
|
||||
generated 16-byte value stored in the filesystem superblock. This
|
||||
value is intended to be used as a salt when deriving an encryption key
|
||||
from a passphrase or other low-entropy user credential.
|
||||
|
||||
FS_IOC_GET_ENCRYPTION_PWSALT is deprecated. Instead, prefer to
|
||||
generate and manage any needed salt(s) in userspace.
|
||||
|
||||
Adding keys
|
||||
-----------
|
||||
|
||||
To provide a master key, userspace must add it to an appropriate
|
||||
keyring using the add_key() system call (see:
|
||||
``Documentation/security/keys/core.rst``). The key type must be
|
||||
"logon"; keys of this type are kept in kernel memory and cannot be
|
||||
read back by userspace. The key description must be "fscrypt:"
|
||||
followed by the 16-character lower case hex representation of the
|
||||
``master_key_descriptor`` that was set in the encryption policy. The
|
||||
key payload must conform to the following structure::
|
||||
|
||||
#define FS_MAX_KEY_SIZE 64
|
||||
|
||||
struct fscrypt_key {
|
||||
u32 mode;
|
||||
u8 raw[FS_MAX_KEY_SIZE];
|
||||
u32 size;
|
||||
};
|
||||
|
||||
``mode`` is ignored; just set it to 0. The actual key is provided in
|
||||
``raw`` with ``size`` indicating its size in bytes. That is, the
|
||||
bytes ``raw[0..size-1]`` (inclusive) are the actual key.
|
||||
|
||||
The key description prefix "fscrypt:" may alternatively be replaced
|
||||
with a filesystem-specific prefix such as "ext4:". However, the
|
||||
filesystem-specific prefixes are deprecated and should not be used in
|
||||
new programs.
|
||||
|
||||
There are several different types of keyrings in which encryption keys
|
||||
may be placed, such as a session keyring, a user session keyring, or a
|
||||
user keyring. Each key must be placed in a keyring that is "attached"
|
||||
to all processes that might need to access files encrypted with it, in
|
||||
the sense that request_key() will find the key. Generally, if only
|
||||
processes belonging to a specific user need to access a given
|
||||
encrypted directory and no session keyring has been installed, then
|
||||
that directory's key should be placed in that user's user session
|
||||
keyring or user keyring. Otherwise, a session keyring should be
|
||||
installed if needed, and the key should be linked into that session
|
||||
keyring, or in a keyring linked into that session keyring.
|
||||
|
||||
Note: introducing the complex visibility semantics of keyrings here
|
||||
was arguably a mistake --- especially given that by design, after any
|
||||
process successfully opens an encrypted file (thereby setting up the
|
||||
per-file key), possessing the keyring key is not actually required for
|
||||
any process to read/write the file until its in-memory inode is
|
||||
evicted. In the future there probably should be a way to provide keys
|
||||
directly to the filesystem instead, which would make the intended
|
||||
semantics clearer.
|
||||
|
||||
Access semantics
|
||||
================
|
||||
|
||||
With the key
|
||||
------------
|
||||
|
||||
With the encryption key, encrypted regular files, directories, and
|
||||
symlinks behave very similarly to their unencrypted counterparts ---
|
||||
after all, the encryption is intended to be transparent. However,
|
||||
astute users may notice some differences in behavior:
|
||||
|
||||
- Unencrypted files, or files encrypted with a different encryption
|
||||
policy (i.e. different key, modes, or flags), cannot be renamed or
|
||||
linked into an encrypted directory; see `Encryption policy
|
||||
enforcement`_. Attempts to do so will fail with EPERM. However,
|
||||
encrypted files can be renamed within an encrypted directory, or
|
||||
into an unencrypted directory.
|
||||
|
||||
- Direct I/O is not supported on encrypted files. Attempts to use
|
||||
direct I/O on such files will fall back to buffered I/O.
|
||||
|
||||
- The fallocate operations FALLOC_FL_COLLAPSE_RANGE,
|
||||
FALLOC_FL_INSERT_RANGE, and FALLOC_FL_ZERO_RANGE are not supported
|
||||
on encrypted files and will fail with EOPNOTSUPP.
|
||||
|
||||
- Online defragmentation of encrypted files is not supported. The
|
||||
EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with
|
||||
EOPNOTSUPP.
|
||||
|
||||
- The ext4 filesystem does not support data journaling with encrypted
|
||||
regular files. It will fall back to ordered data mode instead.
|
||||
|
||||
- DAX (Direct Access) is not supported on encrypted files.
|
||||
|
||||
- The st_size of an encrypted symlink will not necessarily give the
|
||||
length of the symlink target as required by POSIX. It will actually
|
||||
give the length of the ciphertext, which will be slightly longer
|
||||
than the plaintext due to NUL-padding and an extra 2-byte overhead.
|
||||
|
||||
- The maximum length of an encrypted symlink is 2 bytes shorter than
|
||||
the maximum length of an unencrypted symlink. For example, on an
|
||||
EXT4 filesystem with a 4K block size, unencrypted symlinks can be up
|
||||
to 4095 bytes long, while encrypted symlinks can only be up to 4093
|
||||
bytes long (both lengths excluding the terminating null).
|
||||
|
||||
Note that mmap *is* supported. This is possible because the pagecache
|
||||
for an encrypted file contains the plaintext, not the ciphertext.
|
||||
|
||||
Without the key
|
||||
---------------
|
||||
|
||||
Some filesystem operations may be performed on encrypted regular
|
||||
files, directories, and symlinks even before their encryption key has
|
||||
been provided:
|
||||
|
||||
- File metadata may be read, e.g. using stat().
|
||||
|
||||
- Directories may be listed, in which case the filenames will be
|
||||
listed in an encoded form derived from their ciphertext. The
|
||||
current encoding algorithm is described in `Filename hashing and
|
||||
encoding`_. The algorithm is subject to change, but it is
|
||||
guaranteed that the presented filenames will be no longer than
|
||||
NAME_MAX bytes, will not contain the ``/`` or ``\0`` characters, and
|
||||
will uniquely identify directory entries.
|
||||
|
||||
The ``.`` and ``..`` directory entries are special. They are always
|
||||
present and are not encrypted or encoded.
|
||||
|
||||
- Files may be deleted. That is, nondirectory files may be deleted
|
||||
with unlink() as usual, and empty directories may be deleted with
|
||||
rmdir() as usual. Therefore, ``rm`` and ``rm -r`` will work as
|
||||
expected.
|
||||
|
||||
- Symlink targets may be read and followed, but they will be presented
|
||||
in encrypted form, similar to filenames in directories. Hence, they
|
||||
are unlikely to point to anywhere useful.
|
||||
|
||||
Without the key, regular files cannot be opened or truncated.
|
||||
Attempts to do so will fail with ENOKEY. This implies that any
|
||||
regular file operations that require a file descriptor, such as
|
||||
read(), write(), mmap(), fallocate(), and ioctl(), are also forbidden.
|
||||
|
||||
Also without the key, files of any type (including directories) cannot
|
||||
be created or linked into an encrypted directory, nor can a name in an
|
||||
encrypted directory be the source or target of a rename, nor can an
|
||||
O_TMPFILE temporary file be created in an encrypted directory. All
|
||||
such operations will fail with ENOKEY.
|
||||
|
||||
It is not currently possible to back up and restore encrypted files
|
||||
without the encryption key. This would require special APIs which
|
||||
have not yet been implemented.
|
||||
|
||||
Encryption policy enforcement
|
||||
=============================
|
||||
|
||||
After an encryption policy has been set on a directory, all regular
|
||||
files, directories, and symbolic links created in that directory
|
||||
(recursively) will inherit that encryption policy. Special files ---
|
||||
that is, named pipes, device nodes, and UNIX domain sockets --- will
|
||||
not be encrypted.
|
||||
|
||||
Except for those special files, it is forbidden to have unencrypted
|
||||
files, or files encrypted with a different encryption policy, in an
|
||||
encrypted directory tree. Attempts to link or rename such a file into
|
||||
an encrypted directory will fail with EPERM. This is also enforced
|
||||
during ->lookup() to provide limited protection against offline
|
||||
attacks that try to disable or downgrade encryption in known locations
|
||||
where applications may later write sensitive data. It is recommended
|
||||
that systems implementing a form of "verified boot" take advantage of
|
||||
this by validating all top-level encryption policies prior to access.
|
||||
|
||||
Implementation details
|
||||
======================
|
||||
|
||||
Encryption context
|
||||
------------------
|
||||
|
||||
An encryption policy is represented on-disk by a :c:type:`struct
|
||||
fscrypt_context`. It is up to individual filesystems to decide where
|
||||
to store it, but normally it would be stored in a hidden extended
|
||||
attribute. It should *not* be exposed by the xattr-related system
|
||||
calls such as getxattr() and setxattr() because of the special
|
||||
semantics of the encryption xattr. (In particular, there would be
|
||||
much confusion if an encryption policy were to be added to or removed
|
||||
from anything other than an empty directory.) The struct is defined
|
||||
as follows::
|
||||
|
||||
#define FS_KEY_DESCRIPTOR_SIZE 8
|
||||
#define FS_KEY_DERIVATION_NONCE_SIZE 16
|
||||
|
||||
struct fscrypt_context {
|
||||
u8 format;
|
||||
u8 contents_encryption_mode;
|
||||
u8 filenames_encryption_mode;
|
||||
u8 flags;
|
||||
u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
|
||||
u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
|
||||
};
|
||||
|
||||
Note that :c:type:`struct fscrypt_context` contains the same
|
||||
information as :c:type:`struct fscrypt_policy` (see `Setting an
|
||||
encryption policy`_), except that :c:type:`struct fscrypt_context`
|
||||
also contains a nonce. The nonce is randomly generated by the kernel
|
||||
and is used to derive the inode's encryption key as described in
|
||||
`Per-file keys`_.
|
||||
|
||||
Data path changes
|
||||
-----------------
|
||||
|
||||
For the read path (->readpage()) of regular files, filesystems can
|
||||
read the ciphertext into the page cache and decrypt it in-place. The
|
||||
page lock must be held until decryption has finished, to prevent the
|
||||
page from becoming visible to userspace prematurely.
|
||||
|
||||
For the write path (->writepage()) of regular files, filesystems
|
||||
cannot encrypt data in-place in the page cache, since the cached
|
||||
plaintext must be preserved. Instead, filesystems must encrypt into a
|
||||
temporary buffer or "bounce page", then write out the temporary
|
||||
buffer. Some filesystems, such as UBIFS, already use temporary
|
||||
buffers regardless of encryption. Other filesystems, such as ext4 and
|
||||
F2FS, have to allocate bounce pages specially for encryption.
|
||||
|
||||
Filename hashing and encoding
|
||||
-----------------------------
|
||||
|
||||
Modern filesystems accelerate directory lookups by using indexed
|
||||
directories. An indexed directory is organized as a tree keyed by
|
||||
filename hashes. When a ->lookup() is requested, the filesystem
|
||||
normally hashes the filename being looked up so that it can quickly
|
||||
find the corresponding directory entry, if any.
|
||||
|
||||
With encryption, lookups must be supported and efficient both with and
|
||||
without the encryption key. Clearly, it would not work to hash the
|
||||
plaintext filenames, since the plaintext filenames are unavailable
|
||||
without the key. (Hashing the plaintext filenames would also make it
|
||||
impossible for the filesystem's fsck tool to optimize encrypted
|
||||
directories.) Instead, filesystems hash the ciphertext filenames,
|
||||
i.e. the bytes actually stored on-disk in the directory entries. When
|
||||
asked to do a ->lookup() with the key, the filesystem just encrypts
|
||||
the user-supplied name to get the ciphertext.
|
||||
|
||||
Lookups without the key are more complicated. The raw ciphertext may
|
||||
contain the ``\0`` and ``/`` characters, which are illegal in
|
||||
filenames. Therefore, readdir() must base64-encode the ciphertext for
|
||||
presentation. For most filenames, this works fine; on ->lookup(), the
|
||||
filesystem just base64-decodes the user-supplied name to get back to
|
||||
the raw ciphertext.
|
||||
|
||||
However, for very long filenames, base64 encoding would cause the
|
||||
filename length to exceed NAME_MAX. To prevent this, readdir()
|
||||
actually presents long filenames in an abbreviated form which encodes
|
||||
a strong "hash" of the ciphertext filename, along with the optional
|
||||
filesystem-specific hash(es) needed for directory lookups. This
|
||||
allows the filesystem to still, with a high degree of confidence, map
|
||||
the filename given in ->lookup() back to a particular directory entry
|
||||
that was previously listed by readdir(). See :c:type:`struct
|
||||
fscrypt_digested_name` in the source for more details.
|
||||
|
||||
Note that the precise way that filenames are presented to userspace
|
||||
without the key is subject to change in the future. It is only meant
|
||||
as a way to temporarily present valid filenames so that commands like
|
||||
``rm -r`` work as expected on encrypted directories.
|
|
@ -82,6 +82,29 @@ Only the lists of names from directories are merged. Other content
|
|||
such as metadata and extended attributes are reported for the upper
|
||||
directory only. These attributes of the lower directory are hidden.
|
||||
|
||||
credentials
|
||||
-----------
|
||||
|
||||
By default, all access to the upper, lower and work directories is performed with the
|
||||
recorded mounter's MAC and DAC credentials. The incoming accesses are
|
||||
checked against the caller's credentials.
|
||||
|
||||
In the case where caller MAC or DAC credentials do not overlap, a
|
||||
use case available in older versions of the driver, the
|
||||
override_creds mount flag can be turned off and help when the use
|
||||
pattern has a caller with legitimate credentials where the mounter
|
||||
does not. Several unintended side effects will occur though. The
|
||||
caller without certain key capabilities or lower privilege will not
|
||||
always be able to delete files or directories, create nodes, or
|
||||
search some restricted directories. The ability to search and read
|
||||
a directory entry is spotty as a result of the cache mechanism not
|
||||
retesting the credentials because of the assumption, a privileged
|
||||
caller can fill cache, then a lower privilege can read the directory
|
||||
cache. The uneven security model where cache, upperdir and workdir
|
||||
are opened at privilege, but accessed without creating a form of
|
||||
privilege escalation, should only be used with strict understanding
|
||||
of the side effects and of the security policies.
|
||||
|
||||
whiteouts and opaque directories
|
||||
--------------------------------
|
||||
|
||||
|
|
|
@ -487,7 +487,9 @@ manner. The codes are the following:
|
|||
|
||||
Note that there is no guarantee that every flag and associated mnemonic will
|
||||
be present in all further kernel releases. Things get changed, the flags may
|
||||
be vanished or the reverse -- new added.
|
||||
vanish or, conversely, new ones may be added. Interpretation of their meaning
|
||||
might change in future as well. So each consumer of these flags has to
|
||||
follow each specific kernel version for the exact semantic.
|
||||
|
||||
The "Name" field will only be present on a mapping that has been named by
|
||||
userspace, and will show the name passed in by userspace.
|
||||
|
|
|
@ -160,7 +160,7 @@ them but you should handle them according to your needs.
|
|||
UHID_OUTPUT:
|
||||
This is sent if the HID device driver wants to send raw data to the I/O
|
||||
device on the interrupt channel. You should read the payload and forward it to
|
||||
the device. The payload is of type "struct uhid_data_req".
|
||||
the device. The payload is of type "struct uhid_output_req".
|
||||
This may be received even though you haven't received UHID_OPEN, yet.
|
||||
|
||||
UHID_GET_REPORT:
|
||||
|
|
15
Documentation/hw-vuln/index.rst
Normal file
15
Documentation/hw-vuln/index.rst
Normal file
|
@ -0,0 +1,15 @@
|
|||
========================
|
||||
Hardware vulnerabilities
|
||||
========================
|
||||
|
||||
This section describes CPU vulnerabilities and provides an overview of the
|
||||
possible mitigations along with guidance for selecting mitigations if they
|
||||
are configurable at compile, boot or run time.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
l1tf
|
||||
mds
|
||||
tsx_async_abort
|
||||
multihit.rst
|
615
Documentation/hw-vuln/l1tf.rst
Normal file
615
Documentation/hw-vuln/l1tf.rst
Normal file
|
@ -0,0 +1,615 @@
|
|||
L1TF - L1 Terminal Fault
|
||||
========================
|
||||
|
||||
L1 Terminal Fault is a hardware vulnerability which allows unprivileged
|
||||
speculative access to data which is available in the Level 1 Data Cache
|
||||
when the page table entry controlling the virtual address, which is used
|
||||
for the access, has the Present bit cleared or other reserved bits set.
|
||||
|
||||
Affected processors
|
||||
-------------------
|
||||
|
||||
This vulnerability affects a wide range of Intel processors. The
|
||||
vulnerability is not present on:
|
||||
|
||||
- Processors from AMD, Centaur and other non Intel vendors
|
||||
|
||||
- Older processor models, where the CPU family is < 6
|
||||
|
||||
- A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
|
||||
Penwell, Pineview, Silvermont, Airmont, Merrifield)
|
||||
|
||||
- The Intel XEON PHI family
|
||||
|
||||
- Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
|
||||
IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
|
||||
by the Meltdown vulnerability either. These CPUs should become
|
||||
available by end of 2018.
|
||||
|
||||
Whether a processor is affected or not can be read out from the L1TF
|
||||
vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
|
||||
|
||||
Related CVEs
|
||||
------------
|
||||
|
||||
The following CVE entries are related to the L1TF vulnerability:
|
||||
|
||||
============= ================= ==============================
|
||||
CVE-2018-3615 L1 Terminal Fault SGX related aspects
|
||||
CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
|
||||
CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
|
||||
============= ================= ==============================
|
||||
|
||||
Problem
|
||||
-------
|
||||
|
||||
If an instruction accesses a virtual address for which the relevant page
|
||||
table entry (PTE) has the Present bit cleared or other reserved bits set,
|
||||
then speculative execution ignores the invalid PTE and loads the referenced
|
||||
data if it is present in the Level 1 Data Cache, as if the page referenced
|
||||
by the address bits in the PTE was still present and accessible.
|
||||
|
||||
While this is a purely speculative mechanism and the instruction will raise
|
||||
a page fault when it is retired eventually, the pure act of loading the
|
||||
data and making it available to other speculative instructions opens up the
|
||||
opportunity for side channel attacks to unprivileged malicious code,
|
||||
similar to the Meltdown attack.
|
||||
|
||||
While Meltdown breaks the user space to kernel space protection, L1TF
|
||||
allows to attack any physical memory address in the system and the attack
|
||||
works across all protection domains. It allows an attack of SGX and also
|
||||
works from inside virtual machines because the speculation bypasses the
|
||||
extended page table (EPT) protection mechanism.
|
||||
|
||||
|
||||
Attack scenarios
|
||||
----------------
|
||||
|
||||
1. Malicious user space
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Operating Systems store arbitrary information in the address bits of a
|
||||
PTE which is marked non present. This allows a malicious user space
|
||||
application to attack the physical memory to which these PTEs resolve.
|
||||
In some cases user-space can maliciously influence the information
|
||||
encoded in the address bits of the PTE, thus making attacks more
|
||||
deterministic and more practical.
|
||||
|
||||
The Linux kernel contains a mitigation for this attack vector, PTE
|
||||
inversion, which is permanently enabled and has no performance
|
||||
impact. The kernel ensures that the address bits of PTEs, which are not
|
||||
marked present, never point to cacheable physical memory space.
|
||||
|
||||
A system with an up to date kernel is protected against attacks from
|
||||
malicious user space applications.
|
||||
|
||||
2. Malicious guest in a virtual machine
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The fact that L1TF breaks all domain protections allows malicious guest
|
||||
OSes, which can control the PTEs directly, and malicious guest user
|
||||
space applications, which run on an unprotected guest kernel lacking the
|
||||
PTE inversion mitigation for L1TF, to attack physical host memory.
|
||||
|
||||
A special aspect of L1TF in the context of virtualization is symmetric
|
||||
multi threading (SMT). The Intel implementation of SMT is called
|
||||
HyperThreading. The fact that Hyperthreads on the affected processors
|
||||
share the L1 Data Cache (L1D) is important for this. As the flaw allows
|
||||
only to attack data which is present in L1D, a malicious guest running
|
||||
on one Hyperthread can attack the data which is brought into the L1D by
|
||||
the context which runs on the sibling Hyperthread of the same physical
|
||||
core. This context can be host OS, host user space or a different guest.
|
||||
|
||||
If the processor does not support Extended Page Tables, the attack is
|
||||
only possible, when the hypervisor does not sanitize the content of the
|
||||
effective (shadow) page tables.
|
||||
|
||||
While solutions exist to mitigate these attack vectors fully, these
|
||||
mitigations are not enabled by default in the Linux kernel because they
|
||||
can affect performance significantly. The kernel provides several
|
||||
mechanisms which can be utilized to address the problem depending on the
|
||||
deployment scenario. The mitigations, their protection scope and impact
|
||||
are described in the next sections.
|
||||
|
||||
The default mitigations and the rationale for choosing them are explained
|
||||
at the end of this document. See :ref:`default_mitigations`.
|
||||
|
||||
.. _l1tf_sys_info:
|
||||
|
||||
L1TF system information
|
||||
-----------------------
|
||||
|
||||
The Linux kernel provides a sysfs interface to enumerate the current L1TF
|
||||
status of the system: whether the system is vulnerable, and which
|
||||
mitigations are active. The relevant sysfs file is:
|
||||
|
||||
/sys/devices/system/cpu/vulnerabilities/l1tf
|
||||
|
||||
The possible values in this file are:
|
||||
|
||||
=========================== ===============================
|
||||
'Not affected' The processor is not vulnerable
|
||||
'Mitigation: PTE Inversion' The host protection is active
|
||||
=========================== ===============================
|
||||
|
||||
If KVM/VMX is enabled and the processor is vulnerable then the following
|
||||
information is appended to the 'Mitigation: PTE Inversion' part:
|
||||
|
||||
- SMT status:
|
||||
|
||||
===================== ================
|
||||
'VMX: SMT vulnerable' SMT is enabled
|
||||
'VMX: SMT disabled' SMT is disabled
|
||||
===================== ================
|
||||
|
||||
- L1D Flush mode:
|
||||
|
||||
================================ ====================================
|
||||
'L1D vulnerable' L1D flushing is disabled
|
||||
|
||||
'L1D conditional cache flushes' L1D flush is conditionally enabled
|
||||
|
||||
'L1D cache flushes' L1D flush is unconditionally enabled
|
||||
================================ ====================================
|
||||
|
||||
The resulting grade of protection is discussed in the following sections.
|
||||
|
||||
|
||||
Host mitigation mechanism
|
||||
-------------------------
|
||||
|
||||
The kernel is unconditionally protected against L1TF attacks from malicious
|
||||
user space running on the host.
|
||||
|
||||
|
||||
Guest mitigation mechanisms
|
||||
---------------------------
|
||||
|
||||
.. _l1d_flush:
|
||||
|
||||
1. L1D flush on VMENTER
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To make sure that a guest cannot attack data which is present in the L1D
|
||||
the hypervisor flushes the L1D before entering the guest.
|
||||
|
||||
Flushing the L1D evicts not only the data which should not be accessed
|
||||
by a potentially malicious guest, it also flushes the guest
|
||||
data. Flushing the L1D has a performance impact as the processor has to
|
||||
bring the flushed guest data back into the L1D. Depending on the
|
||||
frequency of VMEXIT/VMENTER and the type of computations in the guest
|
||||
performance degradation in the range of 1% to 50% has been observed. For
|
||||
scenarios where guest VMEXIT/VMENTER are rare the performance impact is
|
||||
minimal. Virtio and mechanisms like posted interrupts are designed to
|
||||
confine the VMEXITs to a bare minimum, but specific configurations and
|
||||
application scenarios might still suffer from a high VMEXIT rate.
|
||||
|
||||
The kernel provides two L1D flush modes:
|
||||
- conditional ('cond')
|
||||
- unconditional ('always')
|
||||
|
||||
The conditional mode avoids L1D flushing after VMEXITs which execute
|
||||
only audited code paths before the corresponding VMENTER. These code
|
||||
paths have been verified that they cannot expose secrets or other
|
||||
interesting data to an attacker, but they can leak information about the
|
||||
address space layout of the hypervisor.
|
||||
|
||||
Unconditional mode flushes L1D on all VMENTER invocations and provides
|
||||
maximum protection. It has a higher overhead than the conditional
|
||||
mode. The overhead cannot be quantified correctly as it depends on the
|
||||
workload scenario and the resulting number of VMEXITs.
|
||||
|
||||
The general recommendation is to enable L1D flush on VMENTER. The kernel
|
||||
defaults to conditional mode on affected processors.
|
||||
|
||||
**Note**, that L1D flush does not prevent the SMT problem because the
|
||||
sibling thread will also bring back its data into the L1D which makes it
|
||||
attackable again.
|
||||
|
||||
L1D flush can be controlled by the administrator via the kernel command
|
||||
line and sysfs control files. See :ref:`mitigation_control_command_line`
|
||||
and :ref:`mitigation_control_kvm`.
|
||||
|
||||
.. _guest_confinement:
|
||||
|
||||
2. Guest VCPU confinement to dedicated physical cores
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To address the SMT problem, it is possible to make a guest or a group of
|
||||
guests affine to one or more physical cores. The proper mechanism for
|
||||
that is to utilize exclusive cpusets to ensure that no other guest or
|
||||
host tasks can run on these cores.
|
||||
|
||||
If only a single guest or related guests run on sibling SMT threads on
|
||||
the same physical core then they can only attack their own memory and
|
||||
restricted parts of the host memory.
|
||||
|
||||
Host memory is attackable, when one of the sibling SMT threads runs in
|
||||
host OS (hypervisor) context and the other in guest context. The amount
|
||||
of valuable information from the host OS context depends on the context
|
||||
which the host OS executes, i.e. interrupts, soft interrupts and kernel
|
||||
threads. The amount of valuable data from these contexts cannot be
|
||||
declared as non-interesting for an attacker without deep inspection of
|
||||
the code.
|
||||
|
||||
**Note**, that assigning guests to a fixed set of physical cores affects
|
||||
the ability of the scheduler to do load balancing and might have
|
||||
negative effects on CPU utilization depending on the hosting
|
||||
scenario. Disabling SMT might be a viable alternative for particular
|
||||
scenarios.
|
||||
|
||||
For further information about confining guests to a single or to a group
|
||||
of cores consult the cpusets documentation:
|
||||
|
||||
https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
|
||||
|
||||
.. _interrupt_isolation:
|
||||
|
||||
3. Interrupt affinity
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Interrupts can be made affine to logical CPUs. This is not universally
|
||||
true because there are types of interrupts which are truly per CPU
|
||||
interrupts, e.g. the local timer interrupt. Aside from that, multi queue
|
||||
devices affine their interrupts to single CPUs or groups of CPUs per
|
||||
queue without allowing the administrator to control the affinities.
|
||||
|
||||
Moving the interrupts, which can be affinity controlled, away from CPUs
|
||||
which run untrusted guests, reduces the attack vector space.
|
||||
|
||||
Whether the interrupts which are affine to CPUs, which run untrusted
|
||||
guests, provide interesting data for an attacker depends on the system
|
||||
configuration and the scenarios which run on the system. While for some
|
||||
of the interrupts it can be assumed that they won't expose interesting
|
||||
information beyond exposing hints about the host OS memory layout, there
|
||||
is no way to make general assumptions.
|
||||
|
||||
Interrupt affinity can be controlled by the administrator via the
|
||||
/proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
|
||||
available at:
|
||||
|
||||
https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
|
||||
|
||||
.. _smt_control:
|
||||
|
||||
4. SMT control
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
To prevent the SMT issues of L1TF it might be necessary to disable SMT
|
||||
completely. Disabling SMT can have a significant performance impact, but
|
||||
the impact depends on the hosting scenario and the type of workloads.
|
||||
The impact of disabling SMT needs also to be weighed against the impact
|
||||
of other mitigation solutions like confining guests to dedicated cores.
|
||||
|
||||
The kernel provides a sysfs interface to retrieve the status of SMT and
|
||||
to control it. It also provides a kernel command line interface to
|
||||
control SMT.
|
||||
|
||||
The kernel command line interface consists of the following options:
|
||||
|
||||
=========== ==========================================================
|
||||
nosmt Affects the bring up of the secondary CPUs during boot. The
|
||||
kernel tries to bring all present CPUs online during the
|
||||
boot process. "nosmt" makes sure that from each physical
|
||||
core only one - the so called primary (hyper) thread is
|
||||
activated. Due to a design flaw of Intel processors related
|
||||
to Machine Check Exceptions the non primary siblings have
|
||||
to be brought up at least partially and are then shut down
|
||||
again. "nosmt" can be undone via the sysfs interface.
|
||||
|
||||
nosmt=force Has the same effect as "nosmt" but it does not allow to
|
||||
undo the SMT disable via the sysfs interface.
|
||||
=========== ==========================================================
|
||||
|
||||
The sysfs interface provides two files:
|
||||
|
||||
- /sys/devices/system/cpu/smt/control
|
||||
- /sys/devices/system/cpu/smt/active
|
||||
|
||||
/sys/devices/system/cpu/smt/control:
|
||||
|
||||
This file allows to read out the SMT control state and provides the
|
||||
ability to disable or (re)enable SMT. The possible states are:
|
||||
|
||||
============== ===================================================
|
||||
on SMT is supported by the CPU and enabled. All
|
||||
logical CPUs can be onlined and offlined without
|
||||
restrictions.
|
||||
|
||||
off SMT is supported by the CPU and disabled. Only
|
||||
the so called primary SMT threads can be onlined
|
||||
and offlined without restrictions. An attempt to
|
||||
online a non-primary sibling is rejected.
|
||||
|
||||
forceoff Same as 'off' but the state cannot be controlled.
|
||||
Attempts to write to the control file are rejected.
|
||||
|
||||
notsupported The processor does not support SMT. It's therefore
|
||||
not affected by the SMT implications of L1TF.
|
||||
Attempts to write to the control file are rejected.
|
||||
============== ===================================================
|
||||
|
||||
The possible states which can be written into this file to control SMT
|
||||
state are:
|
||||
|
||||
- on
|
||||
- off
|
||||
- forceoff
|
||||
|
||||
/sys/devices/system/cpu/smt/active:
|
||||
|
||||
This file reports whether SMT is enabled and active, i.e. if on any
|
||||
physical core two or more sibling threads are online.
|
||||
|
||||
SMT control is also possible at boot time via the l1tf kernel command
|
||||
line parameter in combination with L1D flush control. See
|
||||
:ref:`mitigation_control_command_line`.
|
||||
|
||||
5. Disabling EPT
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
Disabling EPT for virtual machines provides full mitigation for L1TF even
|
||||
with SMT enabled, because the effective page tables for guests are
|
||||
managed and sanitized by the hypervisor. Though disabling EPT has a
|
||||
significant performance impact especially when the Meltdown mitigation
|
||||
KPTI is enabled.
|
||||
|
||||
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
||||
|
||||
There is ongoing research and development for new mitigation mechanisms to
|
||||
address the performance impact of disabling SMT or EPT.
|
||||
|
||||
.. _mitigation_control_command_line:
|
||||
|
||||
Mitigation control on the kernel command line
|
||||
---------------------------------------------
|
||||
|
||||
The kernel command line allows to control the L1TF mitigations at boot
|
||||
time with the option "l1tf=". The valid arguments for this option are:
|
||||
|
||||
============ =============================================================
|
||||
full Provides all available mitigations for the L1TF
|
||||
vulnerability. Disables SMT and enables all mitigations in
|
||||
the hypervisors, i.e. unconditional L1D flushing
|
||||
|
||||
SMT control and L1D flush control via the sysfs interface
|
||||
is still possible after boot. Hypervisors will issue a
|
||||
warning when the first VM is started in a potentially
|
||||
insecure configuration, i.e. SMT enabled or L1D flush
|
||||
disabled.
|
||||
|
||||
full,force Same as 'full', but disables SMT and L1D flush runtime
|
||||
control. Implies the 'nosmt=force' command line option.
|
||||
(i.e. sysfs control of SMT is disabled.)
|
||||
|
||||
flush Leaves SMT enabled and enables the default hypervisor
|
||||
mitigation, i.e. conditional L1D flushing
|
||||
|
||||
SMT control and L1D flush control via the sysfs interface
|
||||
is still possible after boot. Hypervisors will issue a
|
||||
warning when the first VM is started in a potentially
|
||||
insecure configuration, i.e. SMT enabled or L1D flush
|
||||
disabled.
|
||||
|
||||
flush,nosmt Disables SMT and enables the default hypervisor mitigation,
|
||||
i.e. conditional L1D flushing.
|
||||
|
||||
SMT control and L1D flush control via the sysfs interface
|
||||
is still possible after boot. Hypervisors will issue a
|
||||
warning when the first VM is started in a potentially
|
||||
insecure configuration, i.e. SMT enabled or L1D flush
|
||||
disabled.
|
||||
|
||||
flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
|
||||
started in a potentially insecure configuration.
|
||||
|
||||
off Disables hypervisor mitigations and doesn't emit any
|
||||
warnings.
|
||||
It also drops the swap size and available RAM limit restrictions
|
||||
on both hypervisor and bare metal.
|
||||
|
||||
============ =============================================================
|
||||
|
||||
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
|
||||
|
||||
|
||||
.. _mitigation_control_kvm:
|
||||
|
||||
Mitigation control for KVM - module parameter
|
||||
-------------------------------------------------------------
|
||||
|
||||
The KVM hypervisor mitigation mechanism, flushing the L1D cache when
|
||||
entering a guest, can be controlled with a module parameter.
|
||||
|
||||
The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
|
||||
following arguments:
|
||||
|
||||
============ ==============================================================
|
||||
always L1D cache flush on every VMENTER.
|
||||
|
||||
cond Flush L1D on VMENTER only when the code between VMEXIT and
|
||||
VMENTER can leak host memory which is considered
|
||||
interesting for an attacker. This still can leak host memory
|
||||
which allows e.g. to determine the host's address space layout.
|
||||
|
||||
never Disables the mitigation
|
||||
============ ==============================================================
|
||||
|
||||
The parameter can be provided on the kernel command line, as a module
|
||||
parameter when loading the modules and at runtime modified via the sysfs
|
||||
file:
|
||||
|
||||
/sys/module/kvm_intel/parameters/vmentry_l1d_flush
|
||||
|
||||
The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
|
||||
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
||||
module parameter is ignored and writes to the sysfs file are rejected.
|
||||
|
||||
.. _mitigation_selection:
|
||||
|
||||
Mitigation selection guide
|
||||
--------------------------
|
||||
|
||||
1. No virtualization in use
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The system is protected by the kernel unconditionally and no further
|
||||
action is required.
|
||||
|
||||
2. Virtualization with trusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the guest comes from a trusted source and the guest OS kernel is
|
||||
guaranteed to have the L1TF mitigations in place the system is fully
|
||||
protected against L1TF and no further action is required.
|
||||
|
||||
To avoid the overhead of the default L1D flushing on VMENTER the
|
||||
administrator can disable the flushing via the kernel command line and
|
||||
sysfs control files. See :ref:`mitigation_control_command_line` and
|
||||
:ref:`mitigation_control_kvm`.
|
||||
|
||||
|
||||
3. Virtualization with untrusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
3.1. SMT not supported or disabled
|
||||
""""""""""""""""""""""""""""""""""
|
||||
|
||||
If SMT is not supported by the processor or disabled in the BIOS or by
|
||||
the kernel, it's only required to enforce L1D flushing on VMENTER.
|
||||
|
||||
Conditional L1D flushing is the default behaviour and can be tuned. See
|
||||
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
||||
|
||||
3.2. EPT not supported or disabled
|
||||
""""""""""""""""""""""""""""""""""
|
||||
|
||||
If EPT is not supported by the processor or disabled in the hypervisor,
|
||||
the system is fully protected. SMT can stay enabled and L1D flushing on
|
||||
VMENTER is not required.
|
||||
|
||||
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
||||
|
||||
3.3. SMT and EPT supported and active
|
||||
"""""""""""""""""""""""""""""""""""""
|
||||
|
||||
If SMT and EPT are supported and active then various degrees of
|
||||
mitigations can be employed:
|
||||
|
||||
- L1D flushing on VMENTER:
|
||||
|
||||
L1D flushing on VMENTER is the minimal protection requirement, but it
|
||||
is only potent in combination with other mitigation methods.
|
||||
|
||||
Conditional L1D flushing is the default behaviour and can be tuned. See
|
||||
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
||||
|
||||
- Guest confinement:
|
||||
|
||||
Confinement of guests to a single or a group of physical cores which
|
||||
are not running any other processes, can reduce the attack surface
|
||||
significantly, but interrupts, soft interrupts and kernel threads can
|
||||
still expose valuable data to a potential attacker. See
|
||||
:ref:`guest_confinement`.
|
||||
|
||||
- Interrupt isolation:
|
||||
|
||||
Isolating the guest CPUs from interrupts can reduce the attack surface
|
||||
further, but still allows a malicious guest to explore a limited amount
|
||||
of host physical memory. This can at least be used to gain knowledge
|
||||
about the host address space layout. The interrupts which have a fixed
|
||||
affinity to the CPUs which run the untrusted guests can depending on
|
||||
the scenario still trigger soft interrupts and schedule kernel threads
|
||||
which might expose valuable information. See
|
||||
:ref:`interrupt_isolation`.
|
||||
|
||||
The above three mitigation methods combined can provide protection to a
|
||||
certain degree, but the risk of the remaining attack surface has to be
|
||||
carefully analyzed. For full protection the following methods are
|
||||
available:
|
||||
|
||||
- Disabling SMT:
|
||||
|
||||
Disabling SMT and enforcing the L1D flushing provides the maximum
|
||||
amount of protection. This mitigation does not depend on any of the
|
||||
above mitigation methods.
|
||||
|
||||
SMT control and L1D flushing can be tuned by the command line
|
||||
parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
|
||||
time with the matching sysfs control files. See :ref:`smt_control`,
|
||||
:ref:`mitigation_control_command_line` and
|
||||
:ref:`mitigation_control_kvm`.
|
||||
|
||||
- Disabling EPT:
|
||||
|
||||
Disabling EPT provides the maximum amount of protection as well. It is
|
||||
not depending on any of the above mitigation methods. SMT can stay
|
||||
enabled and L1D flushing is not required, but the performance impact is
|
||||
significant.
|
||||
|
||||
EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
|
||||
parameter.
|
||||
|
||||
3.4. Nested virtual machines
|
||||
""""""""""""""""""""""""""""
|
||||
|
||||
When nested virtualization is in use, three operating systems are involved:
|
||||
the bare metal hypervisor, the nested hypervisor and the nested virtual
|
||||
machine. VMENTER operations from the nested hypervisor into the nested
|
||||
guest will always be processed by the bare metal hypervisor. If KVM is the
|
||||
bare metal hypervisor it will:
|
||||
|
||||
- Flush the L1D cache on every switch from the nested hypervisor to the
|
||||
nested virtual machine, so that the nested hypervisor's secrets are not
|
||||
exposed to the nested virtual machine;
|
||||
|
||||
- Flush the L1D cache on every switch from the nested virtual machine to
|
||||
the nested hypervisor; this is a complex operation, and flushing the L1D
|
||||
cache avoids that the bare metal hypervisor's secrets are exposed to the
|
||||
nested virtual machine;
|
||||
|
||||
- Instruct the nested hypervisor to not perform any L1D cache flush. This
|
||||
is an optimization to avoid double L1D flushing.
|
||||
|
||||
|
||||
.. _default_mitigations:
|
||||
|
||||
Default mitigations
|
||||
-------------------
|
||||
|
||||
The kernel default mitigations for vulnerable processors are:
|
||||
|
||||
- PTE inversion to protect against malicious user space. This is done
|
||||
unconditionally and cannot be controlled. The swap storage is limited
|
||||
to ~16TB.
|
||||
|
||||
- L1D conditional flushing on VMENTER when EPT is enabled for
|
||||
a guest.
|
||||
|
||||
The kernel does not by default enforce the disabling of SMT, which leaves
|
||||
SMT systems vulnerable when running untrusted guests with EPT enabled.
|
||||
|
||||
The rationale for this choice is:
|
||||
|
||||
- Force disabling SMT can break existing setups, especially with
|
||||
unattended updates.
|
||||
|
||||
- If regular users run untrusted guests on their machine, then L1TF is
|
||||
just an add on to other malware which might be embedded in an untrusted
|
||||
guest, e.g. spam-bots or attacks on the local network.
|
||||
|
||||
There is no technical way to prevent a user from running untrusted code
|
||||
on their machines blindly.
|
||||
|
||||
- It's technically extremely unlikely and from today's knowledge even
|
||||
impossible that L1TF can be exploited via the most popular attack
|
||||
mechanisms like JavaScript because these mechanisms have no way to
|
||||
control PTEs. If this would be possible and no other mitigation would
|
||||
be possible, then the default might be different.
|
||||
|
||||
- The administrators of cloud and hosting setups have to carefully
|
||||
analyze the risk for their scenarios and make the appropriate
|
||||
mitigation choices, which might even vary across their deployed
|
||||
machines and also result in other changes of their overall setup.
|
||||
There is no way for the kernel to provide a sensible default for this
|
||||
kind of scenario.
|
311
Documentation/hw-vuln/mds.rst
Normal file
311
Documentation/hw-vuln/mds.rst
Normal file
|
@ -0,0 +1,311 @@
|
|||
MDS - Microarchitectural Data Sampling
|
||||
======================================
|
||||
|
||||
Microarchitectural Data Sampling is a hardware vulnerability which allows
|
||||
unprivileged speculative access to data which is available in various CPU
|
||||
internal buffers.
|
||||
|
||||
Affected processors
|
||||
-------------------
|
||||
|
||||
This vulnerability affects a wide range of Intel processors. The
|
||||
vulnerability is not present on:
|
||||
|
||||
- Processors from AMD, Centaur and other non Intel vendors
|
||||
|
||||
- Older processor models, where the CPU family is < 6
|
||||
|
||||
- Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
|
||||
|
||||
- Intel processors which have the ARCH_CAP_MDS_NO bit set in the
|
||||
IA32_ARCH_CAPABILITIES MSR.
|
||||
|
||||
Whether a processor is affected or not can be read out from the MDS
|
||||
vulnerability file in sysfs. See :ref:`mds_sys_info`.
|
||||
|
||||
Not all processors are affected by all variants of MDS, but the mitigation
|
||||
is identical for all of them so the kernel treats them as a single
|
||||
vulnerability.
|
||||
|
||||
Related CVEs
|
||||
------------
|
||||
|
||||
The following CVE entries are related to the MDS vulnerability:
|
||||
|
||||
============== ===== ===================================================
|
||||
CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
|
||||
CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
|
||||
CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
|
||||
CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
|
||||
============== ===== ===================================================
|
||||
|
||||
Problem
|
||||
-------
|
||||
|
||||
When performing store, load, L1 refill operations, processors write data
|
||||
into temporary microarchitectural structures (buffers). The data in the
|
||||
buffer can be forwarded to load operations as an optimization.
|
||||
|
||||
Under certain conditions, usually a fault/assist caused by a load
|
||||
operation, data unrelated to the load memory address can be speculatively
|
||||
forwarded from the buffers. Because the load operation causes a fault or
|
||||
assist and its result will be discarded, the forwarded data will not cause
|
||||
incorrect program execution or state changes. But a malicious operation
|
||||
may be able to forward this speculative data to a disclosure gadget which
|
||||
allows in turn to infer the value via a cache side channel attack.
|
||||
|
||||
Because the buffers are potentially shared between Hyper-Threads, cross
|
||||
Hyper-Thread attacks are possible.
|
||||
|
||||
Deeper technical information is available in the MDS specific x86
|
||||
architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
|
||||
|
||||
|
||||
Attack scenarios
|
||||
----------------
|
||||
|
||||
Attacks against the MDS vulnerabilities can be mounted from malicious non
|
||||
privileged user space applications running on hosts or guests. Malicious
|
||||
guest OSes can obviously mount attacks as well.
|
||||
|
||||
Contrary to other speculation based vulnerabilities the MDS vulnerability
|
||||
does not allow the attacker to control the memory target address. As a
|
||||
consequence the attacks are purely sampling based, but as demonstrated with
|
||||
the TLBleed attack samples can be postprocessed successfully.
|
||||
|
||||
Web-Browsers
|
||||
^^^^^^^^^^^^
|
||||
|
||||
It's unclear whether attacks through Web-Browsers are possible at
|
||||
all. The exploitation through JavaScript is considered very unlikely,
|
||||
but other widely used web technologies like WebAssembly could possibly be
|
||||
abused.
|
||||
|
||||
|
||||
.. _mds_sys_info:
|
||||
|
||||
MDS system information
|
||||
-----------------------
|
||||
|
||||
The Linux kernel provides a sysfs interface to enumerate the current MDS
|
||||
status of the system: whether the system is vulnerable, and which
|
||||
mitigations are active. The relevant sysfs file is:
|
||||
|
||||
/sys/devices/system/cpu/vulnerabilities/mds
|
||||
|
||||
The possible values in this file are:
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - 'Not affected'
|
||||
- The processor is not vulnerable
|
||||
* - 'Vulnerable'
|
||||
- The processor is vulnerable, but no mitigation enabled
|
||||
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
|
||||
- The processor is vulnerable but microcode is not updated.
|
||||
|
||||
The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
|
||||
* - 'Mitigation: Clear CPU buffers'
|
||||
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||
enabled.
|
||||
|
||||
If the processor is vulnerable then the following information is appended
|
||||
to the above information:
|
||||
|
||||
======================== ============================================
|
||||
'SMT vulnerable' SMT is enabled
|
||||
'SMT mitigated' SMT is enabled and mitigated
|
||||
'SMT disabled' SMT is disabled
|
||||
'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
|
||||
======================== ============================================
|
||||
|
||||
.. _vmwerv:
|
||||
|
||||
Best effort mitigation mode
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the processor is vulnerable, but the availability of the microcode based
|
||||
mitigation mechanism is not advertised via CPUID the kernel selects a best
|
||||
effort mitigation mode. This mode invokes the mitigation instructions
|
||||
without a guarantee that they clear the CPU buffers.
|
||||
|
||||
This is done to address virtualization scenarios where the host has the
|
||||
microcode update applied, but the hypervisor is not yet updated to expose
|
||||
the CPUID to the guest. If the host has updated microcode the protection
|
||||
takes effect; otherwise a few CPU cycles are wasted pointlessly.
|
||||
|
||||
The state in the mds sysfs file reflects this situation accordingly.
|
||||
|
||||
|
||||
Mitigation mechanism
|
||||
-------------------------
|
||||
|
||||
The kernel detects the affected CPUs and the presence of the microcode
|
||||
which is required.
|
||||
|
||||
If a CPU is affected and the microcode is available, then the kernel
|
||||
enables the mitigation by default. The mitigation can be controlled at boot
|
||||
time via a kernel command line option. See
|
||||
:ref:`mds_mitigation_control_command_line`.
|
||||
|
||||
.. _cpu_buffer_clear:
|
||||
|
||||
CPU buffer clearing
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The mitigation for MDS clears the affected CPU buffers on return to user
|
||||
space and when entering a guest.
|
||||
|
||||
If SMT is enabled it also clears the buffers on idle entry when the CPU
|
||||
is only affected by MSBDS and not any other MDS variant, because the
|
||||
other variants cannot be protected against cross Hyper-Thread attacks.
|
||||
|
||||
For CPUs which are only affected by MSBDS the user space, guest and idle
|
||||
transition mitigations are sufficient and SMT is not affected.
|
||||
|
||||
.. _virt_mechanism:
|
||||
|
||||
Virtualization mitigation
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The protection for host to guest transition depends on the L1TF
|
||||
vulnerability of the CPU:
|
||||
|
||||
- CPU is affected by L1TF:
|
||||
|
||||
If the L1D flush mitigation is enabled and up to date microcode is
|
||||
available, the L1D flush mitigation is automatically protecting the
|
||||
guest transition.
|
||||
|
||||
If the L1D flush mitigation is disabled then the MDS mitigation is
|
||||
invoked explicitly when the host MDS mitigation is enabled.
|
||||
|
||||
For details on L1TF and virtualization see:
|
||||
:ref:`Documentation/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
|
||||
|
||||
- CPU is not affected by L1TF:
|
||||
|
||||
CPU buffers are flushed before entering the guest when the host MDS
|
||||
mitigation is enabled.
|
||||
|
||||
The resulting MDS protection matrix for the host to guest transition:
|
||||
|
||||
============ ===== ============= ============ =================
|
||||
L1TF MDS VMX-L1FLUSH Host MDS MDS-State
|
||||
|
||||
Don't care No Don't care N/A Not affected
|
||||
|
||||
Yes Yes Disabled Off Vulnerable
|
||||
|
||||
Yes Yes Disabled Full Mitigated
|
||||
|
||||
Yes Yes Enabled Don't care Mitigated
|
||||
|
||||
No Yes N/A Off Vulnerable
|
||||
|
||||
No Yes N/A Full Mitigated
|
||||
============ ===== ============= ============ =================
|
||||
|
||||
This only covers the host to guest transition, i.e. prevents leakage from
|
||||
host to guest, but does not protect the guest internally. Guests need to
|
||||
have their own protections.
|
||||
|
||||
.. _xeon_phi:
|
||||
|
||||
XEON PHI specific considerations
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The XEON PHI processor family is affected by MSBDS which can be exploited
|
||||
cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
|
||||
to use MWAIT in user space (Ring 3) which opens a potential attack vector
|
||||
for malicious user space. The exposure can be disabled on the kernel
|
||||
command line with the 'ring3mwait=disable' command line option.
|
||||
|
||||
XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
|
||||
before the CPU enters an idle state. As XEON PHI is not affected by L1TF
|
||||
either disabling SMT is not required for full protection.
|
||||
|
||||
.. _mds_smt_control:
|
||||
|
||||
SMT control
|
||||
^^^^^^^^^^^
|
||||
|
||||
All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
|
||||
means on CPUs which are affected by MFBDS or MLPDS it is necessary to
|
||||
disable SMT for full protection. These are most of the affected CPUs; the
|
||||
exception is XEON PHI, see :ref:`xeon_phi`.
|
||||
|
||||
Disabling SMT can have a significant performance impact, but the impact
|
||||
depends on the type of workloads.
|
||||
|
||||
See the relevant chapter in the L1TF mitigation documentation for details:
|
||||
:ref:`Documentation/hw-vuln/l1tf.rst <smt_control>`.
|
||||
|
||||
|
||||
.. _mds_mitigation_control_command_line:
|
||||
|
||||
Mitigation control on the kernel command line
|
||||
---------------------------------------------
|
||||
|
||||
The kernel command line allows to control the MDS mitigations at boot
|
||||
time with the option "mds=". The valid arguments for this option are:
|
||||
|
||||
============ =============================================================
|
||||
full If the CPU is vulnerable, enable all available mitigations
|
||||
for the MDS vulnerability, CPU buffer clearing on exit to
|
||||
userspace and when entering a VM. Idle transitions are
|
||||
protected as well if SMT is enabled.
|
||||
|
||||
It does not automatically disable SMT.
|
||||
|
||||
full,nosmt The same as mds=full, with SMT disabled on vulnerable
|
||||
CPUs. This is the complete mitigation.
|
||||
|
||||
off Disables MDS mitigations completely.
|
||||
|
||||
============ =============================================================
|
||||
|
||||
Not specifying this option is equivalent to "mds=full". For processors
|
||||
that are affected by both TAA (TSX Asynchronous Abort) and MDS,
|
||||
specifying just "mds=off" without an accompanying "tsx_async_abort=off"
|
||||
will have no effect as the same mitigation is used for both
|
||||
vulnerabilities.
|
||||
|
||||
Mitigation selection guide
|
||||
--------------------------
|
||||
|
||||
1. Trusted userspace
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If all userspace applications are from a trusted source and do not
|
||||
execute untrusted code which is supplied externally, then the mitigation
|
||||
can be disabled.
|
||||
|
||||
|
||||
2. Virtualization with trusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The same considerations as above versus trusted user space apply.
|
||||
|
||||
3. Virtualization with untrusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The protection depends on the state of the L1TF mitigations.
|
||||
See :ref:`virt_mechanism`.
|
||||
|
||||
If the MDS mitigation is enabled and SMT is disabled, guest to host and
|
||||
guest to guest attacks are prevented.
|
||||
|
||||
.. _mds_default_mitigations:
|
||||
|
||||
Default mitigations
|
||||
-------------------
|
||||
|
||||
The kernel default mitigations for vulnerable processors are:
|
||||
|
||||
- Enable CPU buffer clearing
|
||||
|
||||
The kernel does not by default enforce the disabling of SMT, which leaves
|
||||
SMT systems vulnerable when running untrusted code. The same rationale as
|
||||
for L1TF applies.
|
||||
See :ref:`Documentation/hw-vuln/l1tf.rst <default_mitigations>`.
|
163
Documentation/hw-vuln/multihit.rst
Normal file
163
Documentation/hw-vuln/multihit.rst
Normal file
|
@ -0,0 +1,163 @@
|
|||
iTLB multihit
|
||||
=============
|
||||
|
||||
iTLB multihit is an erratum where some processors may incur a machine check
|
||||
error, possibly resulting in an unrecoverable CPU lockup, when an
|
||||
instruction fetch hits multiple entries in the instruction TLB. This can
|
||||
occur when the page size is changed along with either the physical address
|
||||
or cache type. A malicious guest running on a virtualized system can
|
||||
exploit this erratum to perform a denial of service attack.
|
||||
|
||||
|
||||
Affected processors
|
||||
-------------------
|
||||
|
||||
Variations of this erratum are present on most Intel Core and Xeon processor
|
||||
models. The erratum is not present on:
|
||||
|
||||
- non-Intel processors
|
||||
|
||||
- Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont)
|
||||
|
||||
- Intel processors that have the PSCHANGE_MC_NO bit set in the
|
||||
IA32_ARCH_CAPABILITIES MSR.
|
||||
|
||||
|
||||
Related CVEs
|
||||
------------
|
||||
|
||||
The following CVE entry is related to this issue:
|
||||
|
||||
============== =================================================
|
||||
CVE-2018-12207 Machine Check Error Avoidance on Page Size Change
|
||||
============== =================================================
|
||||
|
||||
|
||||
Problem
|
||||
-------
|
||||
|
||||
Privileged software, including OS and virtual machine managers (VMM), are in
|
||||
charge of memory management. A key component in memory management is the control
|
||||
of the page tables. Modern processors use virtual memory, a technique that creates
|
||||
the illusion of a very large memory for processors. This virtual space is split
|
||||
into pages of a given size. Page tables translate virtual addresses to physical
|
||||
addresses.
|
||||
|
||||
To reduce latency when performing a virtual to physical address translation,
|
||||
processors include a structure, called TLB, that caches recent translations.
|
||||
There are separate TLBs for instruction (iTLB) and data (dTLB).
|
||||
|
||||
Under this erratum, instructions are fetched from a linear address translated
|
||||
using a 4 KB translation cached in the iTLB. Privileged software modifies the
|
||||
paging structure so that the same linear address uses a large page size (2 MB, 4
|
||||
MB, 1 GB) with a different physical address or memory type. After the page
|
||||
structure modification but before the software invalidates any iTLB entries for
|
||||
the linear address, a code fetch that happens on the same linear address may
|
||||
cause a machine-check error which can result in a system hang or shutdown.
|
||||
|
||||
|
||||
Attack scenarios
|
||||
----------------
|
||||
|
||||
Attacks against the iTLB multihit erratum can be mounted from malicious
|
||||
guests in a virtualized system.
|
||||
|
||||
|
||||
iTLB multihit system information
|
||||
--------------------------------
|
||||
|
||||
The Linux kernel provides a sysfs interface to enumerate the current iTLB
|
||||
multihit status of the system: whether the system is vulnerable and which
|
||||
mitigations are active. The relevant sysfs file is:
|
||||
|
||||
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
|
||||
|
||||
The possible values in this file are:
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - Not affected
|
||||
- The processor is not vulnerable.
|
||||
* - KVM: Mitigation: Split huge pages
|
||||
- Software changes mitigate this issue.
|
||||
* - KVM: Vulnerable
|
||||
- The processor is vulnerable, but no mitigation enabled
|
||||
|
||||
|
||||
Enumeration of the erratum
|
||||
--------------------------------
|
||||
|
||||
A new bit has been allocated in the IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) MSR
|
||||
and will be set on CPUs which are mitigated against this issue.
|
||||
|
||||
======================================= =========== ===============================
|
||||
IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable,check model
|
||||
IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable,check model
|
||||
IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable
|
||||
======================================= =========== ===============================
|
||||
|
||||
|
||||
Mitigation mechanism
|
||||
-------------------------
|
||||
|
||||
This erratum can be mitigated by restricting the use of large page sizes to
|
||||
non-executable pages. This forces all iTLB entries to be 4K, and removes
|
||||
the possibility of multiple hits.
|
||||
|
||||
In order to mitigate the vulnerability, KVM initially marks all huge pages
|
||||
as non-executable. If the guest attempts to execute in one of those pages,
|
||||
the page is broken down into 4K pages, which are then marked executable.
|
||||
|
||||
If EPT is disabled or not available on the host, KVM is in control of TLB
|
||||
flushes and the problematic situation cannot happen. However, the shadow
|
||||
EPT paging mechanism used by nested virtualization is vulnerable, because
|
||||
the nested guest can trigger multiple iTLB hits by modifying its own
|
||||
(non-nested) page tables. For simplicity, KVM will make large pages
|
||||
non-executable in all shadow paging modes.
|
||||
|
||||
Mitigation control on the kernel command line and KVM - module parameter
|
||||
------------------------------------------------------------------------
|
||||
|
||||
The KVM hypervisor mitigation mechanism for marking huge pages as
|
||||
non-executable can be controlled with a module parameter "nx_huge_pages=".
|
||||
The kernel command line allows to control the iTLB multihit mitigations at
|
||||
boot time with the option "kvm.nx_huge_pages=".
|
||||
|
||||
The valid arguments for these options are:
|
||||
|
||||
========== ================================================================
|
||||
force Mitigation is enabled. In this case, the mitigation implements
|
||||
non-executable huge pages in Linux kernel KVM module. All huge
|
||||
pages in the EPT are marked as non-executable.
|
||||
If a guest attempts to execute in one of those pages, the page is
|
||||
broken down into 4K pages, which are then marked executable.
|
||||
|
||||
off Mitigation is disabled.
|
||||
|
||||
auto Enable mitigation only if the platform is affected and the kernel
|
||||
was not booted with the "mitigations=off" command line parameter.
|
||||
This is the default option.
|
||||
========== ================================================================
|
||||
|
||||
|
||||
Mitigation selection guide
|
||||
--------------------------
|
||||
|
||||
1. No virtualization in use
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The system is protected by the kernel unconditionally and no further
|
||||
action is required.
|
||||
|
||||
2. Virtualization with trusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the guest comes from a trusted source, you may assume that the guest will
|
||||
not attempt to maliciously exploit these errata and no further action is
|
||||
required.
|
||||
|
||||
3. Virtualization with untrusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
If the guest comes from an untrusted source, the guest host kernel will need
|
||||
to apply iTLB multihit mitigation via the kernel command line or kvm
|
||||
module parameter.
|
279
Documentation/hw-vuln/tsx_async_abort.rst
Normal file
279
Documentation/hw-vuln/tsx_async_abort.rst
Normal file
|
@ -0,0 +1,279 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
TAA - TSX Asynchronous Abort
|
||||
======================================
|
||||
|
||||
TAA is a hardware vulnerability that allows unprivileged speculative access to
|
||||
data which is available in various CPU internal buffers by using asynchronous
|
||||
aborts within an Intel TSX transactional region.
|
||||
|
||||
Affected processors
|
||||
-------------------
|
||||
|
||||
This vulnerability only affects Intel processors that support Intel
|
||||
Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
|
||||
is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
|
||||
(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
|
||||
also mitigate against TAA.
|
||||
|
||||
Whether a processor is affected or not can be read out from the TAA
|
||||
vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
|
||||
|
||||
Related CVEs
|
||||
------------
|
||||
|
||||
The following CVE entry is related to this TAA issue:
|
||||
|
||||
============== ===== ===================================================
|
||||
CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some
|
||||
microprocessors utilizing speculative execution may
|
||||
allow an authenticated user to potentially enable
|
||||
information disclosure via a side channel with
|
||||
local access.
|
||||
============== ===== ===================================================
|
||||
|
||||
Problem
|
||||
-------
|
||||
|
||||
When performing store, load or L1 refill operations, processors write
|
||||
data into temporary microarchitectural structures (buffers). The data in
|
||||
those buffers can be forwarded to load operations as an optimization.
|
||||
|
||||
Intel TSX is an extension to the x86 instruction set architecture that adds
|
||||
hardware transactional memory support to improve performance of multi-threaded
|
||||
software. TSX lets the processor expose and exploit concurrency hidden in an
|
||||
application due to dynamically avoiding unnecessary synchronization.
|
||||
|
||||
TSX supports atomic memory transactions that are either committed (success) or
|
||||
aborted. During an abort, operations that happened within the transactional region
|
||||
are rolled back. An asynchronous abort takes place, among other options, when a
|
||||
different thread accesses a cache line that is also used within the transactional
|
||||
region when that access might lead to a data race.
|
||||
|
||||
Immediately after an uncompleted asynchronous abort, certain speculatively
|
||||
executed loads may read data from those internal buffers and pass it to dependent
|
||||
operations. This can be then used to infer the value via a cache side channel
|
||||
attack.
|
||||
|
||||
Because the buffers are potentially shared between Hyper-Threads, cross
|
||||
Hyper-Thread attacks are possible.
|
||||
|
||||
The victim of a malicious actor does not need to make use of TSX. Only the
|
||||
attacker needs to begin a TSX transaction and raise an asynchronous abort
|
||||
which in turn potentially leaks data stored in the buffers.
|
||||
|
||||
More detailed technical information is available in the TAA specific x86
|
||||
architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
|
||||
|
||||
|
||||
Attack scenarios
|
||||
----------------
|
||||
|
||||
Attacks against the TAA vulnerability can be implemented from unprivileged
|
||||
applications running on hosts or guests.
|
||||
|
||||
As for MDS, the attacker has no control over the memory addresses that can
|
||||
be leaked. Only the victim is responsible for bringing data to the CPU. As
|
||||
a result, the malicious actor has to sample as much data as possible and
|
||||
then postprocess it to try to infer any useful information from it.
|
||||
|
||||
A potential attacker only has read access to the data. Also, there is no direct
|
||||
privilege escalation by using this technique.
|
||||
|
||||
|
||||
.. _tsx_async_abort_sys_info:
|
||||
|
||||
TAA system information
|
||||
-----------------------
|
||||
|
||||
The Linux kernel provides a sysfs interface to enumerate the current TAA status
|
||||
of mitigated systems. The relevant sysfs file is:
|
||||
|
||||
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
|
||||
|
||||
The possible values in this file are:
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - 'Vulnerable'
|
||||
- The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
|
||||
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
|
||||
- The system tries to clear the buffers but the microcode might not support the operation.
|
||||
* - 'Mitigation: Clear CPU buffers'
|
||||
- The microcode has been updated to clear the buffers. TSX is still enabled.
|
||||
* - 'Mitigation: TSX disabled'
|
||||
- TSX is disabled.
|
||||
* - 'Not affected'
|
||||
- The CPU is not affected by this issue.
|
||||
|
||||
.. _ucode_needed:
|
||||
|
||||
Best effort mitigation mode
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the processor is vulnerable, but the availability of the microcode-based
|
||||
mitigation mechanism is not advertised via CPUID the kernel selects a best
|
||||
effort mitigation mode. This mode invokes the mitigation instructions
|
||||
without a guarantee that they clear the CPU buffers.
|
||||
|
||||
This is done to address virtualization scenarios where the host has the
|
||||
microcode update applied, but the hypervisor is not yet updated to expose the
|
||||
CPUID to the guest. If the host has updated microcode the protection takes
|
||||
effect; otherwise a few CPU cycles are wasted pointlessly.
|
||||
|
||||
The state in the tsx_async_abort sysfs file reflects this situation
|
||||
accordingly.
|
||||
|
||||
|
||||
Mitigation mechanism
|
||||
--------------------
|
||||
|
||||
The kernel detects the affected CPUs and the presence of the microcode which is
|
||||
required. If a CPU is affected and the microcode is available, then the kernel
|
||||
enables the mitigation by default.
|
||||
|
||||
|
||||
The mitigation can be controlled at boot time via a kernel command line option.
|
||||
See :ref:`taa_mitigation_control_command_line`.
|
||||
|
||||
.. _virt_mechanism:
|
||||
|
||||
Virtualization mitigation
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Affected systems where the host has TAA microcode and TAA is mitigated by
|
||||
having disabled TSX previously, are not vulnerable regardless of the status
|
||||
of the VMs.
|
||||
|
||||
In all other cases, if the host either does not have the TAA microcode or
|
||||
the kernel is not mitigated, the system might be vulnerable.
|
||||
|
||||
|
||||
.. _taa_mitigation_control_command_line:
|
||||
|
||||
Mitigation control on the kernel command line
|
||||
---------------------------------------------
|
||||
|
||||
The kernel command line allows to control the TAA mitigations at boot time with
|
||||
the option "tsx_async_abort=". The valid arguments for this option are:
|
||||
|
||||
============ =============================================================
|
||||
off This option disables the TAA mitigation on affected platforms.
|
||||
If the system has TSX enabled (see next parameter) and the CPU
|
||||
is affected, the system is vulnerable.
|
||||
|
||||
full TAA mitigation is enabled. If TSX is enabled, on an affected
|
||||
system it will clear CPU buffers on ring transitions. On
|
||||
systems which are MDS-affected and deploy MDS mitigation,
|
||||
TAA is also mitigated. Specifying this option on those
|
||||
systems will have no effect.
|
||||
|
||||
full,nosmt The same as tsx_async_abort=full, with SMT disabled on
|
||||
vulnerable CPUs that have TSX enabled. This is the complete
|
||||
mitigation. When TSX is disabled, SMT is not disabled because
|
||||
CPU is not vulnerable to cross-thread TAA attacks.
|
||||
============ =============================================================
|
||||
|
||||
Not specifying this option is equivalent to "tsx_async_abort=full". For
|
||||
processors that are affected by both TAA and MDS, specifying just
|
||||
"tsx_async_abort=off" without an accompanying "mds=off" will have no
|
||||
effect as the same mitigation is used for both vulnerabilities.
|
||||
|
||||
The kernel command line also allows to control the TSX feature using the
|
||||
parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
|
||||
to control the TSX feature and the enumeration of the TSX feature bits (RTM
|
||||
and HLE) in CPUID.
|
||||
|
||||
The valid options are:
|
||||
|
||||
============ =============================================================
|
||||
off Disables TSX on the system.
|
||||
|
||||
Note that this option takes effect only on newer CPUs which are
|
||||
not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1
|
||||
and which get the new IA32_TSX_CTRL MSR through a microcode
|
||||
update. This new MSR allows for the reliable deactivation of
|
||||
the TSX functionality.
|
||||
|
||||
on Enables TSX.
|
||||
|
||||
Although there are mitigations for all known security
|
||||
vulnerabilities, TSX has been known to be an accelerator for
|
||||
several previous speculation-related CVEs, and so there may be
|
||||
unknown security risks associated with leaving it enabled.
|
||||
|
||||
auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
|
||||
on the system.
|
||||
============ =============================================================
|
||||
|
||||
Not specifying this option is equivalent to "tsx=off".
|
||||
|
||||
The following combinations of the "tsx_async_abort" and "tsx" are possible. For
|
||||
affected platforms tsx=auto is equivalent to tsx=off and the result will be:
|
||||
|
||||
========= ========================== =========================================
|
||||
tsx=on tsx_async_abort=full The system will use VERW to clear CPU
|
||||
buffers. Cross-thread attacks are still
|
||||
possible on SMT machines.
|
||||
tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT
|
||||
mitigated.
|
||||
tsx=on tsx_async_abort=off The system is vulnerable.
|
||||
tsx=off tsx_async_abort=full TSX might be disabled if microcode
|
||||
provides a TSX control MSR. If so,
|
||||
system is not vulnerable.
|
||||
tsx=off tsx_async_abort=full,nosmt Ditto
|
||||
tsx=off tsx_async_abort=off Ditto
|
||||
========= ========================== =========================================
|
||||
|
||||
|
||||
For unaffected platforms "tsx=on" and "tsx_async_abort=full" does not clear CPU
|
||||
buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
|
||||
"tsx" command line argument has no effect.
|
||||
|
||||
For the affected platforms, the table below indicates the mitigation status for the
|
||||
combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO
|
||||
and TSX_CTRL_MSR.
|
||||
|
||||
======= ========= ============= ========================================
|
||||
MDS_NO MD_CLEAR TSX_CTRL_MSR Status
|
||||
======= ========= ============= ========================================
|
||||
0 0 0 Vulnerable (needs microcode)
|
||||
0 1 0 MDS and TAA mitigated via VERW
|
||||
1 1 0 MDS fixed, TAA vulnerable if TSX enabled
|
||||
because MD_CLEAR has no meaning and
|
||||
VERW is not guaranteed to clear buffers
|
||||
1 X 1 MDS fixed, TAA can be mitigated by
|
||||
VERW or TSX_CTRL_MSR
|
||||
======= ========= ============= ========================================
|
||||
|
||||
Mitigation selection guide
|
||||
--------------------------
|
||||
|
||||
1. Trusted userspace and guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If all user space applications are from a trusted source and do not execute
|
||||
untrusted code which is supplied externally, then the mitigation can be
|
||||
disabled. The same applies to virtualized environments with trusted guests.
|
||||
|
||||
|
||||
2. Untrusted userspace and guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If there are untrusted applications or guests on the system, enabling TSX
|
||||
might allow a malicious actor to leak data from the host or from other
|
||||
processes running on the same physical core.
|
||||
|
||||
If the microcode is available and the TSX is disabled on the host, attacks
|
||||
are prevented in a virtualized environment as well, even if the VMs do not
|
||||
explicitly enable the mitigation.
|
||||
|
||||
|
||||
.. _taa_default_mitigations:
|
||||
|
||||
Default mitigations
|
||||
-------------------
|
||||
|
||||
The kernel's default action for vulnerable processors is:
|
||||
|
||||
- Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).
|
|
@ -32,7 +32,7 @@ Supported chips:
|
|||
Datasheet: Publicly available at the Texas Instruments website
|
||||
http://www.ti.com/
|
||||
|
||||
Author: Lothar Felten <l-felten@ti.com>
|
||||
Author: Lothar Felten <lothar.felten@gmail.com>
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
|
|
@ -19,6 +19,24 @@ Contents:
|
|||
gpu/index
|
||||
80211/index
|
||||
|
||||
This section describes CPU vulnerabilities and their mitigations.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
hw-vuln/index
|
||||
|
||||
Architecture-specific documentation
|
||||
-----------------------------------
|
||||
|
||||
These books provide programming details about architecture-specific
|
||||
implementation.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
x86/index
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
|
|
|
@ -301,7 +301,10 @@ them as any other INPUT_PROP_BUTTONPAD device.
|
|||
INPUT_PROP_ACCELEROMETER
|
||||
-------------------------
|
||||
Directional axes on this device (absolute and/or relative x, y, z) represent
|
||||
accelerometer data. All other axes retain their meaning. A device must not mix
|
||||
accelerometer data. Some devices also report gyroscope data, which devices
|
||||
can report through the rotational axes (absolute and/or relative rx, ry, rz).
|
||||
|
||||
All other axes retain their meaning. A device must not mix
|
||||
regular directional axes and accelerometer axes on the same event node.
|
||||
|
||||
Guidelines:
|
||||
|
|
|
@ -314,7 +314,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
This facility can be used to prevent such uncontrolled
|
||||
GPE floodings.
|
||||
Format: <int>
|
||||
Support masking of GPEs numbered from 0x00 to 0x7f.
|
||||
|
||||
acpi_no_auto_serialize [HW,ACPI]
|
||||
Disable auto-serialization of AML methods
|
||||
|
@ -1090,12 +1089,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
nopku [X86] Disable Memory Protection Keys CPU feature found
|
||||
in some Intel CPUs.
|
||||
|
||||
eagerfpu= [X86]
|
||||
on enable eager fpu restore
|
||||
off disable eager fpu restore
|
||||
auto selects the default scheme, which automatically
|
||||
enables eagerfpu restore for xsaveopt.
|
||||
|
||||
module.async_probe [KNL]
|
||||
Enable asynchronous probe on this module.
|
||||
|
||||
|
@ -1984,6 +1977,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
kmemcheck=2 (one-shot mode)
|
||||
Default: 2 (one-shot mode)
|
||||
|
||||
kpti= [ARM64] Control page table isolation of user
|
||||
and kernel address spaces.
|
||||
Default: enabled on cores which need mitigation.
|
||||
0: force disabled
|
||||
1: force enabled
|
||||
|
||||
kstack=N [X86] Print N words from the kernel stack
|
||||
in oops dumps.
|
||||
|
||||
|
@ -1994,6 +1993,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
KVM MMU at runtime.
|
||||
Default is 0 (off)
|
||||
|
||||
kvm.nx_huge_pages=
|
||||
[KVM] Controls the software workaround for the
|
||||
X86_BUG_ITLB_MULTIHIT bug.
|
||||
force : Always deploy workaround.
|
||||
off : Never deploy workaround.
|
||||
auto : Deploy workaround based on the presence of
|
||||
X86_BUG_ITLB_MULTIHIT.
|
||||
|
||||
Default is 'auto'.
|
||||
|
||||
If the software workaround is enabled for the host,
|
||||
guests do not need to enable it for nested guests.
|
||||
|
||||
kvm.nx_huge_pages_recovery_ratio=
|
||||
[KVM] Controls how many 4KiB pages are periodically zapped
|
||||
back to huge pages. 0 disables the recovery, otherwise if
|
||||
the value is N KVM will zap 1/Nth of the 4KiB pages every
|
||||
minute. The default is 60.
|
||||
|
||||
kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
|
||||
Default is 1 (enabled)
|
||||
|
||||
|
@ -2022,10 +2040,87 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
(virtualized real and unpaged mode) on capable
|
||||
Intel chips. Default is 1 (enabled)
|
||||
|
||||
kvm-intel.vmentry_l1d_flush=[KVM,Intel] Mitigation for L1 Terminal Fault
|
||||
CVE-2018-3620.
|
||||
|
||||
Valid arguments: never, cond, always
|
||||
|
||||
always: L1D cache flush on every VMENTER.
|
||||
cond: Flush L1D on VMENTER only when the code between
|
||||
VMEXIT and VMENTER can leak host memory.
|
||||
never: Disables the mitigation
|
||||
|
||||
Default is cond (do L1 cache flush in specific instances)
|
||||
|
||||
kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
|
||||
feature (tagged TLBs) on capable Intel chips.
|
||||
Default is 1 (enabled)
|
||||
|
||||
l1tf= [X86] Control mitigation of the L1TF vulnerability on
|
||||
affected CPUs
|
||||
|
||||
The kernel PTE inversion protection is unconditionally
|
||||
enabled and cannot be disabled.
|
||||
|
||||
full
|
||||
Provides all available mitigations for the
|
||||
L1TF vulnerability. Disables SMT and
|
||||
enables all mitigations in the
|
||||
hypervisors, i.e. unconditional L1D flush.
|
||||
|
||||
SMT control and L1D flush control via the
|
||||
sysfs interface is still possible after
|
||||
boot. Hypervisors will issue a warning
|
||||
when the first VM is started in a
|
||||
potentially insecure configuration,
|
||||
i.e. SMT enabled or L1D flush disabled.
|
||||
|
||||
full,force
|
||||
Same as 'full', but disables SMT and L1D
|
||||
flush runtime control. Implies the
|
||||
'nosmt=force' command line option.
|
||||
(i.e. sysfs control of SMT is disabled.)
|
||||
|
||||
flush
|
||||
Leaves SMT enabled and enables the default
|
||||
hypervisor mitigation, i.e. conditional
|
||||
L1D flush.
|
||||
|
||||
SMT control and L1D flush control via the
|
||||
sysfs interface is still possible after
|
||||
boot. Hypervisors will issue a warning
|
||||
when the first VM is started in a
|
||||
potentially insecure configuration,
|
||||
i.e. SMT enabled or L1D flush disabled.
|
||||
|
||||
flush,nosmt
|
||||
|
||||
Disables SMT and enables the default
|
||||
hypervisor mitigation.
|
||||
|
||||
SMT control and L1D flush control via the
|
||||
sysfs interface is still possible after
|
||||
boot. Hypervisors will issue a warning
|
||||
when the first VM is started in a
|
||||
potentially insecure configuration,
|
||||
i.e. SMT enabled or L1D flush disabled.
|
||||
|
||||
flush,nowarn
|
||||
Same as 'flush', but hypervisors will not
|
||||
warn when a VM is started in a potentially
|
||||
insecure configuration.
|
||||
|
||||
off
|
||||
Disables hypervisor mitigations and doesn't
|
||||
emit any warnings.
|
||||
It also drops the swap size and available
|
||||
RAM limit restriction on both hypervisor and
|
||||
bare metal.
|
||||
|
||||
Default is 'flush'.
|
||||
|
||||
For details see: Documentation/hw-vuln/l1tf.rst
|
||||
|
||||
l2cr= [PPC]
|
||||
|
||||
l3cr= [PPC]
|
||||
|
@ -2267,6 +2362,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
Format: <first>,<last>
|
||||
Specifies range of consoles to be captured by the MDA.
|
||||
|
||||
mds= [X86,INTEL]
|
||||
Control mitigation for the Micro-architectural Data
|
||||
Sampling (MDS) vulnerability.
|
||||
|
||||
Certain CPUs are vulnerable to an exploit against CPU
|
||||
internal buffers which can forward information to a
|
||||
disclosure gadget under certain conditions.
|
||||
|
||||
In vulnerable processors, the speculatively
|
||||
forwarded data can be used in a cache side channel
|
||||
attack, to access data to which the attacker does
|
||||
not have direct access.
|
||||
|
||||
This parameter controls the MDS mitigation. The
|
||||
options are:
|
||||
|
||||
full - Enable MDS mitigation on vulnerable CPUs
|
||||
full,nosmt - Enable MDS mitigation and disable
|
||||
SMT on vulnerable CPUs
|
||||
off - Unconditionally disable MDS mitigation
|
||||
|
||||
On TAA-affected machines, mds=off can be prevented by
|
||||
an active TAA mitigation as both vulnerabilities are
|
||||
mitigated with the same mechanism so in order to disable
|
||||
this mitigation, you need to specify tsx_async_abort=off
|
||||
too.
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
mds=full.
|
||||
|
||||
For details see: Documentation/hw-vuln/mds.rst
|
||||
|
||||
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
||||
Amount of memory to be used when the kernel is not able
|
||||
to see the whole system memory or for test.
|
||||
|
@ -2389,6 +2516,47 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
in the "bleeding edge" mini2440 support kernel at
|
||||
http://repo.or.cz/w/linux-2.6/mini2440.git
|
||||
|
||||
mitigations=
|
||||
[X86] Control optional mitigations for CPU
|
||||
vulnerabilities. This is a set of curated,
|
||||
arch-independent options, each of which is an
|
||||
aggregation of existing arch-specific options.
|
||||
|
||||
off
|
||||
Disable all optional CPU mitigations. This
|
||||
improves system performance, but it may also
|
||||
expose users to several CPU vulnerabilities.
|
||||
Equivalent to: nopti [X86]
|
||||
nospectre_v1 [X86]
|
||||
nospectre_v2 [X86]
|
||||
spectre_v2_user=off [X86]
|
||||
spec_store_bypass_disable=off [X86]
|
||||
l1tf=off [X86]
|
||||
mds=off [X86]
|
||||
tsx_async_abort=off [X86]
|
||||
kvm.nx_huge_pages=off [X86]
|
||||
|
||||
Exceptions:
|
||||
This does not have any effect on
|
||||
kvm.nx_huge_pages when
|
||||
kvm.nx_huge_pages=force.
|
||||
|
||||
auto (default)
|
||||
Mitigate all CPU vulnerabilities, but leave SMT
|
||||
enabled, even if it's vulnerable. This is for
|
||||
users who don't want to be surprised by SMT
|
||||
getting disabled across kernel upgrades, or who
|
||||
have other ways of avoiding SMT-based attacks.
|
||||
Equivalent to: (default behavior)
|
||||
|
||||
auto,nosmt
|
||||
Mitigate all CPU vulnerabilities, disabling SMT
|
||||
if needed. This is for users who always want to
|
||||
be fully mitigated, even if it means losing SMT.
|
||||
Equivalent to: l1tf=flush,nosmt [X86]
|
||||
mds=full,nosmt [X86]
|
||||
tsx_async_abort=full,nosmt [X86]
|
||||
|
||||
mminit_loglevel=
|
||||
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
||||
parameter allows control of the logging verbosity for
|
||||
|
@ -2706,7 +2874,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
|
||||
Equivalent to smt=1.
|
||||
|
||||
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
|
||||
[KNL,x86] Disable symmetric multithreading (SMT).
|
||||
nosmt=force: Force disable SMT, cannot be undone
|
||||
via the sysfs control file.
|
||||
|
||||
nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1
|
||||
(bounds check bypass). With this option data leaks are
|
||||
possible in the system.
|
||||
|
||||
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
|
||||
(indirect branch prediction) vulnerability. System may
|
||||
allow data leaks with this option, which is equivalent
|
||||
to spectre_v2=off.
|
||||
|
@ -3328,6 +3504,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
before loading.
|
||||
See Documentation/blockdev/ramdisk.txt.
|
||||
|
||||
psi= [KNL] Enable or disable pressure stall information
|
||||
tracking.
|
||||
Format: <bool>
|
||||
|
||||
psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to
|
||||
probe for; one of (bare|imps|exps|lifebook|any).
|
||||
psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports
|
||||
|
@ -3704,6 +3884,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
Run specified binary instead of /init from the ramdisk,
|
||||
used for early userspace startup. See initrd.
|
||||
|
||||
rdrand= [X86]
|
||||
force - Override the decision by the kernel to hide the
|
||||
advertisement of RDRAND support (this affects
|
||||
certain AMD processors because of buggy BIOS
|
||||
support, specifically around the suspend/resume
|
||||
path).
|
||||
|
||||
reboot= [KNL]
|
||||
Format (x86 or x86_64):
|
||||
[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
|
||||
|
@ -3908,6 +4095,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
last alloc / free. For more information see
|
||||
Documentation/vm/slub.txt.
|
||||
|
||||
slub_memcg_sysfs= [MM, SLUB]
|
||||
Determines whether to enable sysfs directories for
|
||||
memory cgroup sub-caches. 1 to enable, 0 to disable.
|
||||
The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON.
|
||||
Enabling this can lead to a very high number of debug
|
||||
directories and files being created under
|
||||
/sys/kernel/slub.
|
||||
|
||||
slub_max_order= [MM, SLUB]
|
||||
Determines the maximum allowed order for slabs.
|
||||
A high setting may cause OOMs due to memory
|
||||
|
@ -3967,9 +4162,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
|
||||
spectre_v2= [X86] Control mitigation of Spectre variant 2
|
||||
(indirect branch speculation) vulnerability.
|
||||
The default operation protects the kernel from
|
||||
user space attacks.
|
||||
|
||||
on - unconditionally enable
|
||||
off - unconditionally disable
|
||||
on - unconditionally enable, implies
|
||||
spectre_v2_user=on
|
||||
off - unconditionally disable, implies
|
||||
spectre_v2_user=off
|
||||
auto - kernel detects whether your CPU model is
|
||||
vulnerable
|
||||
|
||||
|
@ -3979,6 +4178,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
CONFIG_RETPOLINE configuration option, and the
|
||||
compiler with which the kernel was built.
|
||||
|
||||
Selecting 'on' will also enable the mitigation
|
||||
against user space to user space task attacks.
|
||||
|
||||
Selecting 'off' will disable both the kernel and
|
||||
the user space protections.
|
||||
|
||||
Specific mitigations can also be selected manually:
|
||||
|
||||
retpoline - replace indirect branches
|
||||
|
@ -3988,6 +4193,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
Not specifying this option is equivalent to
|
||||
spectre_v2=auto.
|
||||
|
||||
spectre_v2_user=
|
||||
[X86] Control mitigation of Spectre variant 2
|
||||
(indirect branch speculation) vulnerability between
|
||||
user space tasks
|
||||
|
||||
on - Unconditionally enable mitigations. Is
|
||||
enforced by spectre_v2=on
|
||||
|
||||
off - Unconditionally disable mitigations. Is
|
||||
enforced by spectre_v2=off
|
||||
|
||||
prctl - Indirect branch speculation is enabled,
|
||||
but mitigation can be enabled via prctl
|
||||
per thread. The mitigation control state
|
||||
is inherited on fork.
|
||||
|
||||
prctl,ibpb
|
||||
- Like "prctl" above, but only STIBP is
|
||||
controlled per thread. IBPB is issued
|
||||
always when switching between different user
|
||||
space processes.
|
||||
|
||||
seccomp
|
||||
- Same as "prctl" above, but all seccomp
|
||||
threads will enable the mitigation unless
|
||||
they explicitly opt out.
|
||||
|
||||
seccomp,ibpb
|
||||
- Like "seccomp" above, but only STIBP is
|
||||
controlled per thread. IBPB is issued
|
||||
always when switching between different
|
||||
user space processes.
|
||||
|
||||
auto - Kernel selects the mitigation depending on
|
||||
the available CPU features and vulnerability.
|
||||
|
||||
Default mitigation:
|
||||
If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
spectre_v2_user=auto.
|
||||
|
||||
spec_store_bypass_disable=
|
||||
[HW] Control Speculative Store Bypass (SSB) Disable mitigation
|
||||
(Speculative Store Bypass vulnerability)
|
||||
|
@ -4332,6 +4579,76 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
platforms where RDTSC is slow and this accounting
|
||||
can add overhead.
|
||||
|
||||
tsx= [X86] Control Transactional Synchronization
|
||||
Extensions (TSX) feature in Intel processors that
|
||||
support TSX control.
|
||||
|
||||
This parameter controls the TSX feature. The options are:
|
||||
|
||||
on - Enable TSX on the system. Although there are
|
||||
mitigations for all known security vulnerabilities,
|
||||
TSX has been known to be an accelerator for
|
||||
several previous speculation-related CVEs, and
|
||||
so there may be unknown security risks associated
|
||||
with leaving it enabled.
|
||||
|
||||
off - Disable TSX on the system. (Note that this
|
||||
option takes effect only on newer CPUs which are
|
||||
not vulnerable to MDS, i.e., have
|
||||
MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
|
||||
the new IA32_TSX_CTRL MSR through a microcode
|
||||
update. This new MSR allows for the reliable
|
||||
deactivation of the TSX functionality.)
|
||||
|
||||
auto - Disable TSX if X86_BUG_TAA is present,
|
||||
otherwise enable TSX on the system.
|
||||
|
||||
Not specifying this option is equivalent to tsx=off.
|
||||
|
||||
See Documentation/hw-vuln/tsx_async_abort.rst
|
||||
for more details.
|
||||
|
||||
tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
|
||||
Abort (TAA) vulnerability.
|
||||
|
||||
Similar to Micro-architectural Data Sampling (MDS)
|
||||
certain CPUs that support Transactional
|
||||
Synchronization Extensions (TSX) are vulnerable to an
|
||||
exploit against CPU internal buffers which can forward
|
||||
information to a disclosure gadget under certain
|
||||
conditions.
|
||||
|
||||
In vulnerable processors, the speculatively forwarded
|
||||
data can be used in a cache side channel attack, to
|
||||
access data to which the attacker does not have direct
|
||||
access.
|
||||
|
||||
This parameter controls the TAA mitigation. The
|
||||
options are:
|
||||
|
||||
full - Enable TAA mitigation on vulnerable CPUs
|
||||
if TSX is enabled.
|
||||
|
||||
full,nosmt - Enable TAA mitigation and disable SMT on
|
||||
vulnerable CPUs. If TSX is disabled, SMT
|
||||
is not disabled because CPU is not
|
||||
vulnerable to cross-thread TAA attacks.
|
||||
off - Unconditionally disable TAA mitigation
|
||||
|
||||
On MDS-affected machines, tsx_async_abort=off can be
|
||||
prevented by an active MDS mitigation as both vulnerabilities
|
||||
are mitigated with the same mechanism so in order to disable
|
||||
this mitigation, you need to specify mds=off too.
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
tsx_async_abort=full. On CPUs which are MDS affected
|
||||
and deploy MDS mitigation, TAA mitigation is not
|
||||
required and doesn't provide any additional
|
||||
mitigation.
|
||||
|
||||
For details see:
|
||||
Documentation/hw-vuln/tsx_async_abort.rst
|
||||
|
||||
turbografx.map[2|3]= [HW,JOY]
|
||||
TurboGraFX parallel port interface
|
||||
Format:
|
||||
|
|
|
@ -122,14 +122,11 @@ min_adv_mss - INTEGER
|
|||
|
||||
IP Fragmentation:
|
||||
|
||||
ipfrag_high_thresh - INTEGER
|
||||
Maximum memory used to reassemble IP fragments. When
|
||||
ipfrag_high_thresh bytes of memory is allocated for this purpose,
|
||||
the fragment handler will toss packets until ipfrag_low_thresh
|
||||
is reached. This also serves as a maximum limit to namespaces
|
||||
different from the initial one.
|
||||
ipfrag_high_thresh - LONG INTEGER
|
||||
Maximum memory used to reassemble IP fragments.
|
||||
|
||||
ipfrag_low_thresh - INTEGER
|
||||
ipfrag_low_thresh - LONG INTEGER
|
||||
(Obsolete since linux-4.17)
|
||||
Maximum memory used to reassemble IP fragments before the kernel
|
||||
begins to remove incomplete fragment queues to free up resources.
|
||||
The kernel still accepts new fragments for defragmentation.
|
||||
|
@ -233,6 +230,14 @@ tcp_base_mss - INTEGER
|
|||
Path MTU discovery (MTU probing). If MTU probing is enabled,
|
||||
this is the initial MSS used by the connection.
|
||||
|
||||
tcp_min_snd_mss - INTEGER
|
||||
TCP SYN and SYNACK messages usually advertise an ADVMSS option,
|
||||
as described in RFC 1122 and RFC 6691.
|
||||
If this ADVMSS option is smaller than tcp_min_snd_mss,
|
||||
it is silently capped to tcp_min_snd_mss.
|
||||
|
||||
Default : 48 (at least 8 bytes of payload per segment)
|
||||
|
||||
tcp_congestion_control - STRING
|
||||
Set the congestion control algorithm to be used for new
|
||||
connections. The algorithm "reno" is always available, but
|
||||
|
@ -408,6 +413,7 @@ tcp_min_rtt_wlen - INTEGER
|
|||
minimum RTT when it is moved to a longer path (e.g., due to traffic
|
||||
engineering). A longer window makes the filter more resistant to RTT
|
||||
inflations such as transient congestion. The unit is seconds.
|
||||
Possible values: 0 - 86400 (1 day)
|
||||
Default: 300
|
||||
|
||||
tcp_moderate_rcvbuf - BOOLEAN
|
||||
|
|
|
@ -31,6 +31,15 @@ return from vsnprintf.
|
|||
Raw pointer value SHOULD be printed with %p. The kernel supports
|
||||
the following extended format specifiers for pointer types:
|
||||
|
||||
Pointer Types:
|
||||
|
||||
Pointers printed without a specifier extension (i.e. unadorned %p) are
|
||||
hashed to give a unique identifier without leaking kernel addresses to user
|
||||
space. On 64 bit machines the first 32 bits are zeroed. If you _really_
|
||||
want the address see %px below.
|
||||
|
||||
%p abcdef12 or 00000000abcdef12
|
||||
|
||||
Symbols/Function Pointers:
|
||||
|
||||
%pF versatile_init+0x0/0x110
|
||||
|
@ -58,12 +67,24 @@ Symbols/Function Pointers:
|
|||
|
||||
Kernel Pointers:
|
||||
|
||||
%pK 0x01234567 or 0x0123456789abcdef
|
||||
%pK 01234567 or 0123456789abcdef
|
||||
|
||||
For printing kernel pointers which should be hidden from unprivileged
|
||||
users. The behaviour of %pK depends on the kptr_restrict sysctl - see
|
||||
Documentation/sysctl/kernel.txt for more details.
|
||||
|
||||
Unmodified Addresses:
|
||||
|
||||
%px 01234567 or 0123456789abcdef
|
||||
|
||||
For printing pointers when you _really_ want to print the address. Please
|
||||
consider whether or not you are leaking sensitive information about the
|
||||
Kernel layout in memory before printing pointers with %px. %px is
|
||||
functionally equivalent to %lx. %px is preferred to %lx because it is more
|
||||
uniquely grep'able. If, in the future, we need to modify the way the Kernel
|
||||
handles printing pointers it will be nice to be able to find the call
|
||||
sites.
|
||||
|
||||
Struct Resources:
|
||||
|
||||
%pr [mem 0x60000000-0x6fffffff flags 0x2200] or
|
||||
|
|
175
Documentation/siphash.txt
Normal file
175
Documentation/siphash.txt
Normal file
|
@ -0,0 +1,175 @@
|
|||
SipHash - a short input PRF
|
||||
-----------------------------------------------
|
||||
Written by Jason A. Donenfeld <jason@zx2c4.com>
|
||||
|
||||
SipHash is a cryptographically secure PRF -- a keyed hash function -- that
|
||||
performs very well for short inputs, hence the name. It was designed by
|
||||
cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended
|
||||
as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`,
|
||||
and so forth.
|
||||
|
||||
SipHash takes a secret key filled with randomly generated numbers and either
|
||||
an input buffer or several input integers. It spits out an integer that is
|
||||
indistinguishable from random. You may then use that integer as part of secure
|
||||
sequence numbers, secure cookies, or mask it off for use in a hash table.
|
||||
|
||||
1. Generating a key
|
||||
|
||||
Keys should always be generated from a cryptographically secure source of
|
||||
random numbers, either using get_random_bytes or get_random_once:
|
||||
|
||||
siphash_key_t key;
|
||||
get_random_bytes(&key, sizeof(key));
|
||||
|
||||
If you're not deriving your key from here, you're doing it wrong.
|
||||
|
||||
2. Using the functions
|
||||
|
||||
There are two variants of the function, one that takes a list of integers, and
|
||||
one that takes a buffer:
|
||||
|
||||
u64 siphash(const void *data, size_t len, const siphash_key_t *key);
|
||||
|
||||
And:
|
||||
|
||||
u64 siphash_1u64(u64, const siphash_key_t *key);
|
||||
u64 siphash_2u64(u64, u64, const siphash_key_t *key);
|
||||
u64 siphash_3u64(u64, u64, u64, const siphash_key_t *key);
|
||||
u64 siphash_4u64(u64, u64, u64, u64, const siphash_key_t *key);
|
||||
u64 siphash_1u32(u32, const siphash_key_t *key);
|
||||
u64 siphash_2u32(u32, u32, const siphash_key_t *key);
|
||||
u64 siphash_3u32(u32, u32, u32, const siphash_key_t *key);
|
||||
u64 siphash_4u32(u32, u32, u32, u32, const siphash_key_t *key);
|
||||
|
||||
If you pass the generic siphash function something of a constant length, it
|
||||
will constant fold at compile-time and automatically choose one of the
|
||||
optimized functions.
|
||||
|
||||
3. Hashtable key function usage:
|
||||
|
||||
struct some_hashtable {
|
||||
DECLARE_HASHTABLE(hashtable, 8);
|
||||
siphash_key_t key;
|
||||
};
|
||||
|
||||
void init_hashtable(struct some_hashtable *table)
|
||||
{
|
||||
get_random_bytes(&table->key, sizeof(table->key));
|
||||
}
|
||||
|
||||
static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
|
||||
{
|
||||
return &table->hashtable[siphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
|
||||
}
|
||||
|
||||
You may then iterate like usual over the returned hash bucket.
|
||||
|
||||
4. Security
|
||||
|
||||
SipHash has a very high security margin, with its 128-bit key. So long as the
|
||||
key is kept secret, it is impossible for an attacker to guess the outputs of
|
||||
the function, even if able to observe many outputs, since 2^128 outputs
|
||||
is significant.
|
||||
|
||||
Linux implements the "2-4" variant of SipHash.
|
||||
|
||||
5. Struct-passing Pitfalls
|
||||
|
||||
Often times the XuY functions will not be large enough, and instead you'll
|
||||
want to pass a pre-filled struct to siphash. When doing this, it's important
|
||||
to always ensure the struct has no padding holes. The easiest way to do this
|
||||
is to simply arrange the members of the struct in descending order of size,
|
||||
and to use offsetofend() instead of sizeof() for getting the size. For
|
||||
performance reasons, if possible, it's probably a good thing to align the
|
||||
struct to the right boundary. Here's an example:
|
||||
|
||||
const struct {
|
||||
struct in6_addr saddr;
|
||||
u32 counter;
|
||||
u16 dport;
|
||||
} __aligned(SIPHASH_ALIGNMENT) combined = {
|
||||
.saddr = *(struct in6_addr *)saddr,
|
||||
.counter = counter,
|
||||
.dport = dport
|
||||
};
|
||||
u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret);
|
||||
|
||||
6. Resources
|
||||
|
||||
Read the SipHash paper if you're interested in learning more:
|
||||
https://131002.net/siphash/siphash.pdf
|
||||
|
||||
|
||||
~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~
|
||||
|
||||
HalfSipHash - SipHash's insecure younger cousin
|
||||
-----------------------------------------------
|
||||
Written by Jason A. Donenfeld <jason@zx2c4.com>
|
||||
|
||||
On the off-chance that SipHash is not fast enough for your needs, you might be
|
||||
able to justify using HalfSipHash, a terrifying but potentially useful
|
||||
possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and,
|
||||
even scarier, uses an easily brute-forceable 64-bit key (with a 32-bit output)
|
||||
instead of SipHash's 128-bit key. However, this may appeal to some
|
||||
high-performance `jhash` users.
|
||||
|
||||
Danger!
|
||||
|
||||
Do not ever use HalfSipHash except as a hashtable key function, and only
|
||||
then when you can be absolutely certain that the outputs will never be
|
||||
transmitted out of the kernel. This is only remotely useful over `jhash` as a
|
||||
means of mitigating hashtable flooding denial of service attacks.
|
||||
|
||||
1. Generating a key
|
||||
|
||||
Keys should always be generated from a cryptographically secure source of
|
||||
random numbers, either using get_random_bytes or get_random_once:
|
||||
|
||||
hsiphash_key_t key;
|
||||
get_random_bytes(&key, sizeof(key));
|
||||
|
||||
If you're not deriving your key from here, you're doing it wrong.
|
||||
|
||||
2. Using the functions
|
||||
|
||||
There are two variants of the function, one that takes a list of integers, and
|
||||
one that takes a buffer:
|
||||
|
||||
u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key);
|
||||
|
||||
And:
|
||||
|
||||
u32 hsiphash_1u32(u32, const hsiphash_key_t *key);
|
||||
u32 hsiphash_2u32(u32, u32, const hsiphash_key_t *key);
|
||||
u32 hsiphash_3u32(u32, u32, u32, const hsiphash_key_t *key);
|
||||
u32 hsiphash_4u32(u32, u32, u32, u32, const hsiphash_key_t *key);
|
||||
|
||||
If you pass the generic hsiphash function something of a constant length, it
|
||||
will constant fold at compile-time and automatically choose one of the
|
||||
optimized functions.
|
||||
|
||||
3. Hashtable key function usage:
|
||||
|
||||
struct some_hashtable {
|
||||
DECLARE_HASHTABLE(hashtable, 8);
|
||||
hsiphash_key_t key;
|
||||
};
|
||||
|
||||
void init_hashtable(struct some_hashtable *table)
|
||||
{
|
||||
get_random_bytes(&table->key, sizeof(table->key));
|
||||
}
|
||||
|
||||
static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
|
||||
{
|
||||
return &table->hashtable[hsiphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
|
||||
}
|
||||
|
||||
You may then iterate like usual over the returned hash bucket.
|
||||
|
||||
4. Performance
|
||||
|
||||
HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements,
|
||||
this will not be a problem, as the hashtable lookup isn't the bottleneck. And
|
||||
in general, this is probably a good sacrifice to make for the security and DoS
|
||||
resistance of HalfSipHash.
|
|
@ -92,3 +92,12 @@ Speculation misfeature controls
|
|||
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
|
||||
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
|
||||
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
|
||||
|
||||
- PR_SPEC_INDIRECT_BRANCH: Indirect Branch Speculation in User Processes
|
||||
(Mitigate Spectre V2 style attacks against user processes)
|
||||
|
||||
Invocations:
|
||||
* prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
|
||||
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0);
|
||||
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0);
|
||||
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);
|
||||
|
|
|
@ -34,7 +34,9 @@ Currently, these files are in /proc/sys/fs:
|
|||
- overflowgid
|
||||
- pipe-user-pages-hard
|
||||
- pipe-user-pages-soft
|
||||
- protected_fifos
|
||||
- protected_hardlinks
|
||||
- protected_regular
|
||||
- protected_symlinks
|
||||
- suid_dumpable
|
||||
- super-max
|
||||
|
@ -182,6 +184,24 @@ applied.
|
|||
|
||||
==============================================================
|
||||
|
||||
protected_fifos:
|
||||
|
||||
The intent of this protection is to avoid unintentional writes to
|
||||
an attacker-controlled FIFO, where a program expected to create a regular
|
||||
file.
|
||||
|
||||
When set to "0", writing to FIFOs is unrestricted.
|
||||
|
||||
When set to "1" don't allow O_CREAT open on FIFOs that we don't own
|
||||
in world writable sticky directories, unless they are owned by the
|
||||
owner of the directory.
|
||||
|
||||
When set to "2" it also applies to group writable sticky directories.
|
||||
|
||||
This protection is based on the restrictions in Openwall.
|
||||
|
||||
==============================================================
|
||||
|
||||
protected_hardlinks:
|
||||
|
||||
A long-standing class of security issues is the hardlink-based
|
||||
|
@ -202,6 +222,22 @@ This protection is based on the restrictions in Openwall and grsecurity.
|
|||
|
||||
==============================================================
|
||||
|
||||
protected_regular:
|
||||
|
||||
This protection is similar to protected_fifos, but it
|
||||
avoids writes to an attacker-controlled regular file, where a program
|
||||
expected to create one.
|
||||
|
||||
When set to "0", writing to regular files is unrestricted.
|
||||
|
||||
When set to "1" don't allow O_CREAT open on regular files that we
|
||||
don't own in world writable sticky directories, unless they are
|
||||
owned by the owner of the directory.
|
||||
|
||||
When set to "2" it also applies to group writable sticky directories.
|
||||
|
||||
==============================================================
|
||||
|
||||
protected_symlinks:
|
||||
|
||||
A long-standing class of security issues is the symlink-based
|
||||
|
|
|
@ -54,6 +54,14 @@ Values :
|
|||
1 - enable JIT hardening for unprivileged users only
|
||||
2 - enable JIT hardening for all users
|
||||
|
||||
bpf_jit_limit
|
||||
-------------
|
||||
|
||||
This enforces a global limit for memory allocations to the BPF JIT
|
||||
compiler in order to reject unprivileged JIT requests once it has
|
||||
been surpassed. bpf_jit_limit contains the value of the global limit
|
||||
in bytes.
|
||||
|
||||
dev_weight
|
||||
--------------
|
||||
|
||||
|
|
|
@ -365,11 +365,15 @@ autosuspend the interface's device. When the usage counter is = 0
|
|||
then the interface is considered to be idle, and the kernel may
|
||||
autosuspend the device.
|
||||
|
||||
Drivers need not be concerned about balancing changes to the usage
|
||||
counter; the USB core will undo any remaining "get"s when a driver
|
||||
is unbound from its interface. As a corollary, drivers must not call
|
||||
any of the usb_autopm_* functions after their disconnect() routine has
|
||||
returned.
|
||||
Drivers must be careful to balance their overall changes to the usage
|
||||
counter. Unbalanced "get"s will remain in effect when a driver is
|
||||
unbound from its interface, preventing the device from going into
|
||||
runtime suspend should the interface be bound to a driver again. On
|
||||
the other hand, drivers are allowed to achieve this balance by calling
|
||||
the ``usb_autopm_*`` functions even after their ``disconnect`` routine
|
||||
has returned -- say from within a work-queue routine -- provided they
|
||||
retain an active reference to the interface (via ``usb_get_intf`` and
|
||||
``usb_put_intf``).
|
||||
|
||||
Drivers using the async routines are responsible for their own
|
||||
synchronization and mutual exclusion.
|
||||
|
|
|
@ -1,138 +0,0 @@
|
|||
Copyright (C) 1999, 2000 Bruce Tenison
|
||||
Portions Copyright (C) 1999, 2000 David Nelson
|
||||
Thanks to David Nelson for guidance and the usage of the scanner.txt
|
||||
and scanner.c files to model our driver and this informative file.
|
||||
|
||||
Mar. 2, 2000
|
||||
|
||||
CHANGES
|
||||
|
||||
- Initial Revision
|
||||
|
||||
|
||||
OVERVIEW
|
||||
|
||||
This README will address issues regarding how to configure the kernel
|
||||
to access a RIO 500 mp3 player.
|
||||
Before I explain how to use this to access the Rio500 please be warned:
|
||||
|
||||
W A R N I N G:
|
||||
--------------
|
||||
|
||||
Please note that this software is still under development. The authors
|
||||
are in no way responsible for any damage that may occur, no matter how
|
||||
inconsequential.
|
||||
|
||||
It seems that the Rio has a problem when sending .mp3 with low batteries.
|
||||
I suggest when the batteries are low and you want to transfer stuff that you
|
||||
replace it with a fresh one. In my case, what happened is I lost two 16kb
|
||||
blocks (they are no longer usable to store information to it). But I don't
|
||||
know if that's normal or not; it could simply be a problem with the flash
|
||||
memory.
|
||||
|
||||
In an extreme case, I left my Rio playing overnight and the batteries wore
|
||||
down to nothing and appear to have corrupted the flash memory. My RIO
|
||||
needed to be replaced as a result. Diamond tech support is aware of the
|
||||
problem. Do NOT allow your batteries to wear down to nothing before
|
||||
changing them. It appears RIO 500 firmware does not handle low battery
|
||||
power well at all.
|
||||
|
||||
On systems with OHCI controllers, the kernel OHCI code appears to have
|
||||
power on problems with some chipsets. If you are having problems
|
||||
connecting to your RIO 500, try turning it on first and then plugging it
|
||||
into the USB cable.
|
||||
|
||||
Contact information:
|
||||
--------------------
|
||||
|
||||
The main page for the project is hosted at sourceforge.net in the following
|
||||
URL: <http://rio500.sourceforge.net>. You can also go to the project's
|
||||
sourceforge home page at: <http://sourceforge.net/projects/rio500/>.
|
||||
There is also a mailing list: rio500-users@lists.sourceforge.net
|
||||
|
||||
Authors:
|
||||
-------
|
||||
|
||||
Most of the code was written by Cesar Miquel <miquel@df.uba.ar>. Keith
|
||||
Clayton <kclayton@jps.net> is incharge of the PPC port and making sure
|
||||
things work there. Bruce Tenison <btenison@dibbs.net> is adding support
|
||||
for .fon files and also does testing. The program will mostly sure be
|
||||
re-written and Pete Ikusz along with the rest will re-design it. I would
|
||||
also like to thank Tri Nguyen <tmn_3022000@hotmail.com> who provided use
|
||||
with some important information regarding the communication with the Rio.
|
||||
|
||||
ADDITIONAL INFORMATION and Userspace tools
|
||||
|
||||
http://rio500.sourceforge.net/
|
||||
|
||||
|
||||
REQUIREMENTS
|
||||
|
||||
A host with a USB port. Ideally, either a UHCI (Intel) or OHCI
|
||||
(Compaq and others) hardware port should work.
|
||||
|
||||
A Linux development kernel (2.3.x) with USB support enabled or a
|
||||
backported version to linux-2.2.x. See http://www.linux-usb.org for
|
||||
more information on accomplishing this.
|
||||
|
||||
A Linux kernel with RIO 500 support enabled.
|
||||
|
||||
'lspci' which is only needed to determine the type of USB hardware
|
||||
available in your machine.
|
||||
|
||||
CONFIGURATION
|
||||
|
||||
Using `lspci -v`, determine the type of USB hardware available.
|
||||
|
||||
If you see something like:
|
||||
|
||||
USB Controller: ......
|
||||
Flags: .....
|
||||
I/O ports at ....
|
||||
|
||||
Then you have a UHCI based controller.
|
||||
|
||||
If you see something like:
|
||||
|
||||
USB Controller: .....
|
||||
Flags: ....
|
||||
Memory at .....
|
||||
|
||||
Then you have a OHCI based controller.
|
||||
|
||||
Using `make menuconfig` or your preferred method for configuring the
|
||||
kernel, select 'Support for USB', 'OHCI/UHCI' depending on your
|
||||
hardware (determined from the steps above), 'USB Diamond Rio500 support', and
|
||||
'Preliminary USB device filesystem'. Compile and install the modules
|
||||
(you may need to execute `depmod -a` to update the module
|
||||
dependencies).
|
||||
|
||||
Add a device for the USB rio500:
|
||||
`mknod /dev/usb/rio500 c 180 64`
|
||||
|
||||
Set appropriate permissions for /dev/usb/rio500 (don't forget about
|
||||
group and world permissions). Both read and write permissions are
|
||||
required for proper operation.
|
||||
|
||||
Load the appropriate modules (if compiled as modules):
|
||||
|
||||
OHCI:
|
||||
modprobe usbcore
|
||||
modprobe usb-ohci
|
||||
modprobe rio500
|
||||
|
||||
UHCI:
|
||||
modprobe usbcore
|
||||
modprobe usb-uhci (or uhci)
|
||||
modprobe rio500
|
||||
|
||||
That's it. The Rio500 Utils at: http://rio500.sourceforge.net should
|
||||
be able to access the rio500.
|
||||
|
||||
BUGS
|
||||
|
||||
If you encounter any problems feel free to drop me an email.
|
||||
|
||||
Bruce Tenison
|
||||
btenison@dibbs.net
|
||||
|
|
@ -13,7 +13,7 @@ of a virtual machine. The ioctls belong to three classes
|
|||
|
||||
- VM ioctls: These query and set attributes that affect an entire virtual
|
||||
machine, for example memory layout. In addition a VM ioctl is used to
|
||||
create virtual cpus (vcpus).
|
||||
create virtual cpus (vcpus) and devices.
|
||||
|
||||
Only run VM ioctls from the same process (address space) that was used
|
||||
to create the VM.
|
||||
|
@ -24,6 +24,11 @@ of a virtual machine. The ioctls belong to three classes
|
|||
Only run vcpu ioctls from the same thread that was used to create the
|
||||
vcpu.
|
||||
|
||||
- device ioctls: These query and set attributes that control the operation
|
||||
of a single device.
|
||||
|
||||
device ioctls must be issued from the same process (address space) that
|
||||
was used to create the VM.
|
||||
|
||||
2. File descriptors
|
||||
-------------------
|
||||
|
@ -32,10 +37,11 @@ The kvm API is centered around file descriptors. An initial
|
|||
open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
|
||||
can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this
|
||||
handle will create a VM file descriptor which can be used to issue VM
|
||||
ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
|
||||
and return a file descriptor pointing to it. Finally, ioctls on a vcpu
|
||||
fd can be used to control the vcpu, including the important task of
|
||||
actually running guest code.
|
||||
ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will
|
||||
create a virtual cpu or device and return a file descriptor pointing to
|
||||
the new resource. Finally, ioctls on a vcpu or device fd can be used
|
||||
to control the vcpu or device. For vcpus, this includes the important
|
||||
task of actually running guest code.
|
||||
|
||||
In general file descriptors can be migrated among processes by means
|
||||
of fork() and the SCM_RIGHTS facility of unix domain socket. These
|
||||
|
@ -122,14 +128,15 @@ KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
|
|||
privileged user (CAP_SYS_ADMIN).
|
||||
|
||||
|
||||
4.3 KVM_GET_MSR_INDEX_LIST
|
||||
4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST
|
||||
|
||||
Capability: basic
|
||||
Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST
|
||||
Architectures: x86
|
||||
Type: system
|
||||
Type: system ioctl
|
||||
Parameters: struct kvm_msr_list (in/out)
|
||||
Returns: 0 on success; -1 on error
|
||||
Errors:
|
||||
EFAULT: the msr index list cannot be read from or written to
|
||||
E2BIG: the msr index list is too big to fit in the array specified by
|
||||
the user.
|
||||
|
||||
|
@ -138,16 +145,23 @@ struct kvm_msr_list {
|
|||
__u32 indices[0];
|
||||
};
|
||||
|
||||
This ioctl returns the guest msrs that are supported. The list varies
|
||||
by kvm version and host processor, but does not change otherwise. The
|
||||
user fills in the size of the indices array in nmsrs, and in return
|
||||
kvm adjusts nmsrs to reflect the actual number of msrs and fills in
|
||||
the indices array with their numbers.
|
||||
The user fills in the size of the indices array in nmsrs, and in return
|
||||
kvm adjusts nmsrs to reflect the actual number of msrs and fills in the
|
||||
indices array with their numbers.
|
||||
|
||||
KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list
|
||||
varies by kvm version and host processor, but does not change otherwise.
|
||||
|
||||
Note: if kvm indicates support for MCE (KVM_CAP_MCE), then the MCE bank MSRs are
|
||||
not returned in the MSR list, as different vcpus can have a different number
|
||||
of banks, as set via the KVM_X86_SETUP_MCE ioctl.
|
||||
|
||||
KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed
|
||||
to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities
|
||||
and processor features that are exposed via MSRs (e.g., VMX capabilities).
|
||||
This list also varies by kvm version and host processor, but does not change
|
||||
otherwise.
|
||||
|
||||
|
||||
4.4 KVM_CHECK_EXTENSION
|
||||
|
||||
|
@ -474,14 +488,22 @@ Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead.
|
|||
|
||||
4.18 KVM_GET_MSRS
|
||||
|
||||
Capability: basic
|
||||
Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system)
|
||||
Architectures: x86
|
||||
Type: vcpu ioctl
|
||||
Type: system ioctl, vcpu ioctl
|
||||
Parameters: struct kvm_msrs (in/out)
|
||||
Returns: 0 on success, -1 on error
|
||||
Returns: number of msrs successfully returned;
|
||||
-1 on error
|
||||
|
||||
When used as a system ioctl:
|
||||
Reads the values of MSR-based features that are available for the VM. This
|
||||
is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values.
|
||||
The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST
|
||||
in a system ioctl.
|
||||
|
||||
When used as a vcpu ioctl:
|
||||
Reads model-specific registers from the vcpu. Supported msr indices can
|
||||
be obtained using KVM_GET_MSR_INDEX_LIST.
|
||||
be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl.
|
||||
|
||||
struct kvm_msrs {
|
||||
__u32 nmsrs; /* number of msrs in entries */
|
||||
|
|
|
@ -13,8 +13,8 @@ The acquisition orders for mutexes are as follows:
|
|||
- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
|
||||
them together is quite rare.
|
||||
|
||||
For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything
|
||||
else is a leaf: no other lock is taken inside the critical sections.
|
||||
Everything else is a leaf: no other lock is taken inside the critical
|
||||
sections.
|
||||
|
||||
2: Exception
|
||||
------------
|
||||
|
@ -142,7 +142,7 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
|
|||
------------
|
||||
|
||||
Name: kvm_lock
|
||||
Type: spinlock_t
|
||||
Type: mutex
|
||||
Arch: any
|
||||
Protects: - vm_list
|
||||
|
||||
|
|
10
Documentation/x86/conf.py
Normal file
10
Documentation/x86/conf.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
# -*- coding: utf-8; mode: python -*-
|
||||
|
||||
project = "X86 architecture specific documentation"
|
||||
|
||||
tags.add("subproject")
|
||||
|
||||
latex_documents = [
|
||||
('index', 'x86.tex', project,
|
||||
'The kernel development community', 'manual'),
|
||||
]
|
9
Documentation/x86/index.rst
Normal file
9
Documentation/x86/index.rst
Normal file
|
@ -0,0 +1,9 @@
|
|||
==========================
|
||||
x86 architecture specifics
|
||||
==========================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
mds
|
||||
tsx_async_abort
|
193
Documentation/x86/mds.rst
Normal file
193
Documentation/x86/mds.rst
Normal file
|
@ -0,0 +1,193 @@
|
|||
Microarchitectural Data Sampling (MDS) mitigation
|
||||
=================================================
|
||||
|
||||
.. _mds:
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
Microarchitectural Data Sampling (MDS) is a family of side channel attacks
|
||||
on internal buffers in Intel CPUs. The variants are:
|
||||
|
||||
- Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126)
|
||||
- Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130)
|
||||
- Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127)
|
||||
- Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091)
|
||||
|
||||
MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a
|
||||
dependent load (store-to-load forwarding) as an optimization. The forward
|
||||
can also happen to a faulting or assisting load operation for a different
|
||||
memory address, which can be exploited under certain conditions. Store
|
||||
buffers are partitioned between Hyper-Threads so cross thread forwarding is
|
||||
not possible. But if a thread enters or exits a sleep state the store
|
||||
buffer is repartitioned which can expose data from one thread to the other.
|
||||
|
||||
MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage
|
||||
L1 miss situations and to hold data which is returned or sent in response
|
||||
to a memory or I/O operation. Fill buffers can forward data to a load
|
||||
operation and also write data to the cache. When the fill buffer is
|
||||
deallocated it can retain the stale data of the preceding operations which
|
||||
can then be forwarded to a faulting or assisting load operation, which can
|
||||
be exploited under certain conditions. Fill buffers are shared between
|
||||
Hyper-Threads so cross thread leakage is possible.
|
||||
|
||||
MLPDS leaks Load Port Data. Load ports are used to perform load operations
|
||||
from memory or I/O. The received data is then forwarded to the register
|
||||
file or a subsequent operation. In some implementations the Load Port can
|
||||
contain stale data from a previous operation which can be forwarded to
|
||||
faulting or assisting loads under certain conditions, which again can be
|
||||
exploited eventually. Load ports are shared between Hyper-Threads so cross
|
||||
thread leakage is possible.
|
||||
|
||||
MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from
|
||||
memory that takes a fault or assist can leave data in a microarchitectural
|
||||
structure that may later be observed using one of the same methods used by
|
||||
MSBDS, MFBDS or MLPDS.
|
||||
|
||||
Exposure assumptions
|
||||
--------------------
|
||||
|
||||
It is assumed that attack code resides in user space or in a guest with one
|
||||
exception. The rationale behind this assumption is that the code construct
|
||||
needed for exploiting MDS requires:
|
||||
|
||||
- to control the load to trigger a fault or assist
|
||||
|
||||
- to have a disclosure gadget which exposes the speculatively accessed
|
||||
data for consumption through a side channel.
|
||||
|
||||
- to control the pointer through which the disclosure gadget exposes the
|
||||
data
|
||||
|
||||
The existence of such a construct in the kernel cannot be excluded with
|
||||
100% certainty, but the complexity involved makes it extremely unlikely.
|
||||
|
||||
There is one exception, which is untrusted BPF. The functionality of
|
||||
untrusted BPF is limited, but it needs to be thoroughly investigated
|
||||
whether it can be used to create such a construct.
|
||||
|
||||
|
||||
Mitigation strategy
|
||||
-------------------
|
||||
|
||||
All variants have the same mitigation strategy at least for the single CPU
|
||||
thread case (SMT off): Force the CPU to clear the affected buffers.
|
||||
|
||||
This is achieved by using the otherwise unused and obsolete VERW
|
||||
instruction in combination with a microcode update. The microcode clears
|
||||
the affected CPU buffers when the VERW instruction is executed.
|
||||
|
||||
For virtualization there are two ways to achieve CPU buffer
|
||||
clearing. Either the modified VERW instruction or via the L1D Flush
|
||||
command. The latter is issued when L1TF mitigation is enabled so the extra
|
||||
VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to
|
||||
be issued.
|
||||
|
||||
If the VERW instruction with the supplied segment selector argument is
|
||||
executed on a CPU without the microcode update there is no side effect
|
||||
other than a small number of pointlessly wasted CPU cycles.
|
||||
|
||||
This does not protect against cross Hyper-Thread attacks except for MSBDS
|
||||
which is only exploitable cross Hyper-thread when one of the Hyper-Threads
|
||||
enters a C-state.
|
||||
|
||||
The kernel provides a function to invoke the buffer clearing:
|
||||
|
||||
mds_clear_cpu_buffers()
|
||||
|
||||
The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
|
||||
(idle) transitions.
|
||||
|
||||
As a special quirk to address virtualization scenarios where the host has
|
||||
the microcode updated, but the hypervisor does not (yet) expose the
|
||||
MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the
|
||||
hope that it might actually clear the buffers. The state is reflected
|
||||
accordingly.
|
||||
|
||||
According to current knowledge additional mitigations inside the kernel
|
||||
itself are not required because the necessary gadgets to expose the leaked
|
||||
data cannot be controlled in a way which allows exploitation from malicious
|
||||
user space or VM guests.
|
||||
|
||||
Kernel internal mitigation modes
|
||||
--------------------------------
|
||||
|
||||
======= ============================================================
|
||||
off Mitigation is disabled. Either the CPU is not affected or
|
||||
mds=off is supplied on the kernel command line
|
||||
|
||||
full Mitigation is enabled. CPU is affected and MD_CLEAR is
|
||||
advertised in CPUID.
|
||||
|
||||
vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not
|
||||
advertised in CPUID. That is mainly for virtualization
|
||||
scenarios where the host has the updated microcode but the
|
||||
hypervisor does not expose MD_CLEAR in CPUID. It's a best
|
||||
effort approach without guarantee.
|
||||
======= ============================================================
|
||||
|
||||
If the CPU is affected and mds=off is not supplied on the kernel command
|
||||
line then the kernel selects the appropriate mitigation mode depending on
|
||||
the availability of the MD_CLEAR CPUID bit.
|
||||
|
||||
Mitigation points
|
||||
-----------------
|
||||
|
||||
1. Return to user space
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
When transitioning from kernel to user space the CPU buffers are flushed
|
||||
on affected CPUs when the mitigation is not disabled on the kernel
|
||||
command line. The mitigation is enabled through the static key
|
||||
mds_user_clear.
|
||||
|
||||
The mitigation is invoked in prepare_exit_to_usermode() which covers
|
||||
all but one of the kernel to user space transitions. The exception
|
||||
is when we return from a Non Maskable Interrupt (NMI), which is
|
||||
handled directly in do_nmi().
|
||||
|
||||
(The reason that NMI is special is that prepare_exit_to_usermode() can
|
||||
enable IRQs. In NMI context, NMIs are blocked, and we don't want to
|
||||
enable IRQs with NMIs blocked.)
|
||||
|
||||
|
||||
2. C-State transition
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
When a CPU goes idle and enters a C-State the CPU buffers need to be
|
||||
cleared on affected CPUs when SMT is active. This addresses the
|
||||
repartitioning of the store buffer when one of the Hyper-Threads enters
|
||||
a C-State.
|
||||
|
||||
When SMT is inactive, i.e. either the CPU does not support it or all
|
||||
sibling threads are offline, CPU buffer clearing is not required.
|
||||
|
||||
The idle clearing is enabled on CPUs which are only affected by MSBDS
|
||||
and not by any other MDS variant. The other MDS variants cannot be
|
||||
protected against cross Hyper-Thread attacks because the Fill Buffer and
|
||||
the Load Ports are shared. So on CPUs affected by other variants, the
|
||||
idle clearing would be a window dressing exercise and is therefore not
|
||||
activated.
|
||||
|
||||
The invocation is controlled by the static key mds_idle_clear which is
|
||||
switched depending on the chosen mitigation mode and the SMT state of
|
||||
the system.
|
||||
|
||||
The buffer clear is only invoked before entering the C-State to prevent
|
||||
stale data from the idling CPU from spilling to the Hyper-Thread
|
||||
sibling after the store buffer got repartitioned and all entries are
|
||||
available to the non idle sibling.
|
||||
|
||||
When coming out of idle the store buffer is partitioned again so each
|
||||
sibling has half of it available. The back from idle CPU could be then
|
||||
speculatively exposed to contents of the sibling. The buffers are
|
||||
flushed either on exit to user space or on VMENTER so malicious code
|
||||
in user space or the guest cannot speculatively access them.
|
||||
|
||||
The mitigation is hooked into all variants of halt()/mwait(), but does
|
||||
not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver
|
||||
has been superseded by the intel_idle driver around 2010 and is
|
||||
preferred on all affected CPUs which are expected to gain the MD_CLEAR
|
||||
functionality in microcode. Aside of that the IO-Port mechanism is a
|
||||
legacy interface which is only used on older systems which are either
|
||||
not affected or do not receive microcode updates anymore.
|
117
Documentation/x86/tsx_async_abort.rst
Normal file
117
Documentation/x86/tsx_async_abort.rst
Normal file
|
@ -0,0 +1,117 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
TSX Async Abort (TAA) mitigation
|
||||
================================
|
||||
|
||||
.. _tsx_async_abort:
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
TSX Async Abort (TAA) is a side channel attack on internal buffers in some
|
||||
Intel processors similar to Microarchitectural Data Sampling (MDS). In this
|
||||
case certain loads may speculatively pass invalid data to dependent operations
|
||||
when an asynchronous abort condition is pending in a Transactional
|
||||
Synchronization Extensions (TSX) transaction. This includes loads with no
|
||||
fault or assist condition. Such loads may speculatively expose stale data from
|
||||
the same uarch data structures as in MDS, with same scope of exposure i.e.
|
||||
same-thread and cross-thread. This issue affects all current processors that
|
||||
support TSX.
|
||||
|
||||
Mitigation strategy
|
||||
-------------------
|
||||
|
||||
a) TSX disable - one of the mitigations is to disable TSX. A new MSR
|
||||
IA32_TSX_CTRL will be available in future and current processors after
|
||||
microcode update which can be used to disable TSX. In addition, it
|
||||
controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
|
||||
|
||||
b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
|
||||
vulnerability. More details on this approach can be found in
|
||||
:ref:`Documentation/hw-vuln/mds.rst <mds>`.
|
||||
|
||||
Kernel internal mitigation modes
|
||||
--------------------------------
|
||||
|
||||
============= ============================================================
|
||||
off Mitigation is disabled. Either the CPU is not affected or
|
||||
tsx_async_abort=off is supplied on the kernel command line.
|
||||
|
||||
tsx disabled Mitigation is enabled. TSX feature is disabled by default at
|
||||
bootup on processors that support TSX control.
|
||||
|
||||
verw Mitigation is enabled. CPU is affected and MD_CLEAR is
|
||||
advertised in CPUID.
|
||||
|
||||
ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not
|
||||
advertised in CPUID. That is mainly for virtualization
|
||||
scenarios where the host has the updated microcode but the
|
||||
hypervisor does not expose MD_CLEAR in CPUID. It's a best
|
||||
effort approach without guarantee.
|
||||
============= ============================================================
|
||||
|
||||
If the CPU is affected and the "tsx_async_abort" kernel command line parameter is
|
||||
not provided then the kernel selects an appropriate mitigation depending on the
|
||||
status of RTM and MD_CLEAR CPUID bits.
|
||||
|
||||
Below tables indicate the impact of tsx=on|off|auto cmdline options on state of
|
||||
TAA mitigation, VERW behavior and TSX feature for various combinations of
|
||||
MSR_IA32_ARCH_CAPABILITIES bits.
|
||||
|
||||
1. "tsx=off"
|
||||
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off
|
||||
---------------------------------- -------------------------------------------------------------------------
|
||||
TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
|
||||
after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
0 0 0 HW default Yes Same as MDS Same as MDS
|
||||
0 0 1 Invalid case Invalid case Invalid case Invalid case
|
||||
0 1 0 HW default No Need ucode update Need ucode update
|
||||
0 1 1 Disabled Yes TSX disabled TSX disabled
|
||||
1 X 1 Disabled X None needed None needed
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
|
||||
2. "tsx=on"
|
||||
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on
|
||||
---------------------------------- -------------------------------------------------------------------------
|
||||
TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
|
||||
after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
0 0 0 HW default Yes Same as MDS Same as MDS
|
||||
0 0 1 Invalid case Invalid case Invalid case Invalid case
|
||||
0 1 0 HW default No Need ucode update Need ucode update
|
||||
0 1 1 Enabled Yes None Same as MDS
|
||||
1 X 1 Enabled X None needed None needed
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
|
||||
3. "tsx=auto"
|
||||
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto
|
||||
---------------------------------- -------------------------------------------------------------------------
|
||||
TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
|
||||
after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
0 0 0 HW default Yes Same as MDS Same as MDS
|
||||
0 0 1 Invalid case Invalid case Invalid case Invalid case
|
||||
0 1 0 HW default No Need ucode update Need ucode update
|
||||
0 1 1 Disabled Yes TSX disabled TSX disabled
|
||||
1 X 1 Enabled X None needed None needed
|
||||
========= ========= ============ ============ ============== =================== ======================
|
||||
|
||||
In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
|
||||
indicates whether MSR_IA32_TSX_CTRL is supported.
|
||||
|
||||
There are two control bits in IA32_TSX_CTRL MSR:
|
||||
|
||||
Bit 0: When set it disables the Restricted Transactional Memory (RTM)
|
||||
sub-feature of TSX (will force all transactions to abort on the
|
||||
XBEGIN instruction).
|
||||
|
||||
Bit 1: When set it disables the enumeration of the RTM and HLE feature
|
||||
(i.e. it will make CPUID(EAX=7).EBX{bit4} and
|
||||
CPUID(EAX=7).EBX{bit11} read as 0).
|
15
MAINTAINERS
15
MAINTAINERS
|
@ -11081,6 +11081,13 @@ F: arch/arm/mach-s3c24xx/mach-bast.c
|
|||
F: arch/arm/mach-s3c24xx/bast-ide.c
|
||||
F: arch/arm/mach-s3c24xx/bast-irq.c
|
||||
|
||||
SIPHASH PRF ROUTINES
|
||||
M: Jason A. Donenfeld <Jason@zx2c4.com>
|
||||
S: Maintained
|
||||
F: lib/siphash.c
|
||||
F: lib/test_siphash.c
|
||||
F: include/linux/siphash.h
|
||||
|
||||
TI DAVINCI MACHINE SUPPORT
|
||||
M: Sekhar Nori <nsekhar@ti.com>
|
||||
M: Kevin Hilman <khilman@kernel.org>
|
||||
|
@ -11482,6 +11489,7 @@ F: arch/alpha/kernel/srm_env.c
|
|||
|
||||
STABLE BRANCH
|
||||
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
M: Sasha Levin <sashal@kernel.org>
|
||||
L: stable@vger.kernel.org
|
||||
S: Supported
|
||||
F: Documentation/stable_kernel_rules.txt
|
||||
|
@ -12478,13 +12486,6 @@ W: http://www.linux-usb.org/usbnet
|
|||
S: Maintained
|
||||
F: drivers/net/usb/dm9601.c
|
||||
|
||||
USB DIAMOND RIO500 DRIVER
|
||||
M: Cesar Miquel <miquel@df.uba.ar>
|
||||
L: rio500-users@lists.sourceforge.net
|
||||
W: http://rio500.sourceforge.net
|
||||
S: Maintained
|
||||
F: drivers/usb/misc/rio500*
|
||||
|
||||
USB EHCI DRIVER
|
||||
M: Alan Stern <stern@rowland.harvard.edu>
|
||||
L: linux-usb@vger.kernel.org
|
||||
|
|
82
Makefile
82
Makefile
|
@ -1,6 +1,6 @@
|
|||
VERSION = 4
|
||||
PATCHLEVEL = 9
|
||||
SUBLEVEL = 118
|
||||
SUBLEVEL = 212
|
||||
EXTRAVERSION =
|
||||
NAME = Roaring Lionus
|
||||
|
||||
|
@ -309,11 +309,6 @@ HOSTCXX = g++
|
|||
HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89
|
||||
HOSTCXXFLAGS = -O2
|
||||
|
||||
ifeq ($(shell $(HOSTCC) -v 2>&1 | grep -c "clang version"), 1)
|
||||
HOSTCFLAGS += -Wno-unused-value -Wno-unused-parameter \
|
||||
-Wno-missing-field-initializers -fno-delete-null-pointer-checks
|
||||
endif
|
||||
|
||||
# Decide whether to build built-in, modular, or both.
|
||||
# Normally, just do built-in.
|
||||
|
||||
|
@ -398,7 +393,7 @@ LINUXINCLUDE += $(filter-out $(LINUXINCLUDE),$(USERINCLUDE))
|
|||
|
||||
KBUILD_AFLAGS := -D__ASSEMBLY__
|
||||
KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
|
||||
-fno-strict-aliasing -fno-common \
|
||||
-fno-strict-aliasing -fno-common -fshort-wchar \
|
||||
-Werror-implicit-function-declaration \
|
||||
-Wno-format-security \
|
||||
-Werror \
|
||||
|
@ -410,6 +405,7 @@ KBUILD_AFLAGS_MODULE := -DMODULE
|
|||
KBUILD_CFLAGS_MODULE := -DMODULE
|
||||
KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
|
||||
GCC_PLUGINS_CFLAGS :=
|
||||
CLANG_FLAGS :=
|
||||
|
||||
# Read KERNELRELEASE from include/config/kernel.release (if it exists)
|
||||
KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
|
||||
|
@ -422,7 +418,8 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
|
|||
export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
|
||||
|
||||
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
|
||||
export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_KASAN CFLAGS_UBSAN
|
||||
export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE
|
||||
export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE CFLAGS_UBSAN
|
||||
export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
|
||||
export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
|
||||
export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
|
||||
|
@ -533,6 +530,27 @@ ifneq ($(filter install,$(MAKECMDGOALS)),)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(cc-name),clang)
|
||||
ifneq ($(CROSS_COMPILE),)
|
||||
CLANG_TRIPLE ?= $(CROSS_COMPILE)
|
||||
CLANG_FLAGS += --target=$(notdir $(CLANG_TRIPLE:%-=%))
|
||||
ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_FLAGS)), y)
|
||||
$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
|
||||
endif
|
||||
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
|
||||
CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)
|
||||
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
|
||||
endif
|
||||
ifneq ($(GCC_TOOLCHAIN),)
|
||||
CLANG_FLAGS += --gcc-toolchain=$(GCC_TOOLCHAIN)
|
||||
endif
|
||||
CLANG_FLAGS += -no-integrated-as
|
||||
CLANG_FLAGS += -Werror=unknown-warning-option
|
||||
KBUILD_CFLAGS += $(CLANG_FLAGS)
|
||||
KBUILD_AFLAGS += $(CLANG_FLAGS)
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(mixed-targets),1)
|
||||
# ===========================================================================
|
||||
# We're called with mixed targets (*config and build targets).
|
||||
|
@ -674,6 +692,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,)
|
|||
KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, attribute-alias)
|
||||
|
||||
ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
|
||||
|
@ -729,8 +748,7 @@ export DISABLE_CFI
|
|||
endif
|
||||
|
||||
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
|
||||
KBUILD_CFLAGS += $(call cc-option,-Oz,-Os)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,)
|
||||
KBUILD_CFLAGS += -Os $(call cc-disable-warning,maybe-uninitialized,)
|
||||
else
|
||||
ifdef CONFIG_PROFILE_ALL_BRANCHES
|
||||
KBUILD_CFLAGS += -O2 $(call cc-disable-warning,maybe-uninitialized,)
|
||||
|
@ -790,21 +808,9 @@ endif
|
|||
KBUILD_CFLAGS += $(stackp-flag)
|
||||
|
||||
ifeq ($(cc-name),clang)
|
||||
ifneq ($(CROSS_COMPILE),)
|
||||
CLANG_TRIPLE ?= $(CROSS_COMPILE)
|
||||
CLANG_TARGET := --target=$(notdir $(CLANG_TRIPLE:%-=%))
|
||||
GCC_TOOLCHAIN := $(realpath $(dir $(shell which $(LD)))/..)
|
||||
endif
|
||||
ifneq ($(GCC_TOOLCHAIN),)
|
||||
CLANG_GCC_TC := --gcc-toolchain=$(GCC_TOOLCHAIN)
|
||||
endif
|
||||
KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
|
||||
KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
|
||||
KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, duplicate-decl-specifier)
|
||||
# Quiet clang warning: comparison of unsigned expression < 0 is always false
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
|
||||
|
@ -813,16 +819,14 @@ KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
|
|||
# See modpost pattern 2
|
||||
KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
|
||||
KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
|
||||
KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
|
||||
KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
|
||||
else
|
||||
|
||||
# These warnings generated too much noise in a regular build.
|
||||
# Use make W=1 to enable them (see scripts/Makefile.build)
|
||||
# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
|
||||
endif
|
||||
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
|
||||
ifdef CONFIG_FRAME_POINTER
|
||||
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
|
||||
else
|
||||
|
@ -894,6 +898,9 @@ KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
|
|||
# disable pointer signed / unsigned warnings in gcc 4.0
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
|
||||
|
||||
# disable stringop warnings in gcc 8+
|
||||
KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation)
|
||||
|
||||
# disable invalid "can't wrap" optimizations for signed / pointers
|
||||
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
|
||||
|
||||
|
@ -924,6 +931,18 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=date-time)
|
|||
# enforce correct pointer usage
|
||||
KBUILD_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types)
|
||||
|
||||
# Require designated initializers for all marked structures
|
||||
KBUILD_CFLAGS += $(call cc-option,-Werror=designated-init)
|
||||
|
||||
# change __FILE__ to the relative path from the srctree
|
||||
KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
|
||||
|
||||
# ensure -fcf-protection is disabled when using retpoline as it is
|
||||
# incompatible with -mindirect-branch=thunk-extern
|
||||
ifdef CONFIG_RETPOLINE
|
||||
KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
|
||||
endif
|
||||
|
||||
# use the deterministic mode of AR if available
|
||||
KBUILD_ARFLAGS := $(call ar-option,D)
|
||||
|
||||
|
@ -1628,9 +1647,6 @@ else # KBUILD_EXTMOD
|
|||
|
||||
# We are always building modules
|
||||
KBUILD_MODULES := 1
|
||||
PHONY += crmodverdir
|
||||
crmodverdir:
|
||||
$(cmd_crmodverdir)
|
||||
|
||||
PHONY += $(objtree)/Module.symvers
|
||||
$(objtree)/Module.symvers:
|
||||
|
@ -1642,7 +1658,7 @@ $(objtree)/Module.symvers:
|
|||
|
||||
module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD))
|
||||
PHONY += $(module-dirs) modules
|
||||
$(module-dirs): crmodverdir $(objtree)/Module.symvers
|
||||
$(module-dirs): prepare $(objtree)/Module.symvers
|
||||
$(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@)
|
||||
|
||||
modules: $(module-dirs)
|
||||
|
@ -1683,7 +1699,8 @@ help:
|
|||
|
||||
# Dummies...
|
||||
PHONY += prepare scripts
|
||||
prepare: ;
|
||||
prepare:
|
||||
$(cmd_crmodverdir)
|
||||
scripts: ;
|
||||
endif # KBUILD_EXTMOD
|
||||
|
||||
|
@ -1809,17 +1826,14 @@ endif
|
|||
|
||||
# Modules
|
||||
/: prepare scripts FORCE
|
||||
$(cmd_crmodverdir)
|
||||
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
|
||||
$(build)=$(build-dir)
|
||||
# Make sure the latest headers are built for Documentation
|
||||
Documentation/ samples/: headers_install
|
||||
%/: prepare scripts FORCE
|
||||
$(cmd_crmodverdir)
|
||||
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
|
||||
$(build)=$(build-dir)
|
||||
%.ko: prepare scripts FORCE
|
||||
$(cmd_crmodverdir)
|
||||
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
|
||||
$(build)=$(build-dir) $(@:.ko=.o)
|
||||
$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
|
||||
|
|
|
@ -5,6 +5,9 @@
|
|||
config KEXEC_CORE
|
||||
bool
|
||||
|
||||
config HOTPLUG_SMT
|
||||
bool
|
||||
|
||||
config OPROFILE
|
||||
tristate "OProfile system profiling"
|
||||
depends on PROFILING
|
||||
|
@ -513,6 +516,7 @@ config LTO_CLANG
|
|||
bool "Use clang Link Time Optimization (LTO) (EXPERIMENTAL)"
|
||||
depends on ARCH_SUPPORTS_LTO_CLANG
|
||||
depends on !FTRACE_MCOUNT_RECORD || HAVE_C_RECORDMCOUNT
|
||||
depends on !KASAN
|
||||
select LTO
|
||||
select THIN_ARCHIVES
|
||||
select LD_DEAD_CODE_DATA_ELIMINATION
|
||||
|
|
|
@ -55,15 +55,15 @@
|
|||
|
||||
#elif defined(CONFIG_ALPHA_DP264) || \
|
||||
defined(CONFIG_ALPHA_LYNX) || \
|
||||
defined(CONFIG_ALPHA_SHARK) || \
|
||||
defined(CONFIG_ALPHA_EIGER)
|
||||
defined(CONFIG_ALPHA_SHARK)
|
||||
# define NR_IRQS 64
|
||||
|
||||
#elif defined(CONFIG_ALPHA_TITAN)
|
||||
#define NR_IRQS 80
|
||||
|
||||
#elif defined(CONFIG_ALPHA_RAWHIDE) || \
|
||||
defined(CONFIG_ALPHA_TAKARA)
|
||||
defined(CONFIG_ALPHA_TAKARA) || \
|
||||
defined(CONFIG_ALPHA_EIGER)
|
||||
# define NR_IRQS 128
|
||||
|
||||
#elif defined(CONFIG_ALPHA_WILDFIRE)
|
||||
|
|
|
@ -72,9 +72,15 @@
|
|||
})
|
||||
|
||||
#define user_termios_to_kernel_termios(k, u) \
|
||||
copy_from_user(k, u, sizeof(struct termios))
|
||||
copy_from_user(k, u, sizeof(struct termios2))
|
||||
|
||||
#define kernel_termios_to_user_termios(u, k) \
|
||||
copy_to_user(u, k, sizeof(struct termios2))
|
||||
|
||||
#define user_termios_to_kernel_termios_1(k, u) \
|
||||
copy_from_user(k, u, sizeof(struct termios))
|
||||
|
||||
#define kernel_termios_to_user_termios_1(u, k) \
|
||||
copy_to_user(u, k, sizeof(struct termios))
|
||||
|
||||
#endif /* _ALPHA_TERMIOS_H */
|
||||
|
|
|
@ -31,6 +31,11 @@
|
|||
#define TCXONC _IO('t', 30)
|
||||
#define TCFLSH _IO('t', 31)
|
||||
|
||||
#define TCGETS2 _IOR('T', 42, struct termios2)
|
||||
#define TCSETS2 _IOW('T', 43, struct termios2)
|
||||
#define TCSETSW2 _IOW('T', 44, struct termios2)
|
||||
#define TCSETSF2 _IOW('T', 45, struct termios2)
|
||||
|
||||
#define TIOCSWINSZ _IOW('t', 103, struct winsize)
|
||||
#define TIOCGWINSZ _IOR('t', 104, struct winsize)
|
||||
#define TIOCSTART _IO('t', 110) /* start output, like ^Q */
|
||||
|
|
|
@ -25,6 +25,19 @@ struct termios {
|
|||
speed_t c_ospeed; /* output speed */
|
||||
};
|
||||
|
||||
/* Alpha has identical termios and termios2 */
|
||||
|
||||
struct termios2 {
|
||||
tcflag_t c_iflag; /* input mode flags */
|
||||
tcflag_t c_oflag; /* output mode flags */
|
||||
tcflag_t c_cflag; /* control mode flags */
|
||||
tcflag_t c_lflag; /* local mode flags */
|
||||
cc_t c_cc[NCCS]; /* control characters */
|
||||
cc_t c_line; /* line discipline (== c_cc[19]) */
|
||||
speed_t c_ispeed; /* input speed */
|
||||
speed_t c_ospeed; /* output speed */
|
||||
};
|
||||
|
||||
/* Alpha has matching termios and ktermios */
|
||||
|
||||
struct ktermios {
|
||||
|
@ -147,6 +160,7 @@ struct ktermios {
|
|||
#define B3000000 00034
|
||||
#define B3500000 00035
|
||||
#define B4000000 00036
|
||||
#define BOTHER 00037
|
||||
|
||||
#define CSIZE 00001400
|
||||
#define CS5 00000000
|
||||
|
@ -164,6 +178,9 @@ struct ktermios {
|
|||
#define CMSPAR 010000000000 /* mark or space (stick) parity */
|
||||
#define CRTSCTS 020000000000 /* flow control */
|
||||
|
||||
#define CIBAUD 07600000
|
||||
#define IBSHIFT 16
|
||||
|
||||
/* c_lflag bits */
|
||||
#define ISIG 0x00000080
|
||||
#define ICANON 0x00000100
|
||||
|
|
|
@ -526,24 +526,19 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path,
|
|||
SYSCALL_DEFINE1(osf_utsname, char __user *, name)
|
||||
{
|
||||
int error;
|
||||
char tmp[5 * 32];
|
||||
|
||||
down_read(&uts_sem);
|
||||
error = -EFAULT;
|
||||
if (copy_to_user(name + 0, utsname()->sysname, 32))
|
||||
goto out;
|
||||
if (copy_to_user(name + 32, utsname()->nodename, 32))
|
||||
goto out;
|
||||
if (copy_to_user(name + 64, utsname()->release, 32))
|
||||
goto out;
|
||||
if (copy_to_user(name + 96, utsname()->version, 32))
|
||||
goto out;
|
||||
if (copy_to_user(name + 128, utsname()->machine, 32))
|
||||
goto out;
|
||||
|
||||
error = 0;
|
||||
out:
|
||||
memcpy(tmp + 0 * 32, utsname()->sysname, 32);
|
||||
memcpy(tmp + 1 * 32, utsname()->nodename, 32);
|
||||
memcpy(tmp + 2 * 32, utsname()->release, 32);
|
||||
memcpy(tmp + 3 * 32, utsname()->version, 32);
|
||||
memcpy(tmp + 4 * 32, utsname()->machine, 32);
|
||||
up_read(&uts_sem);
|
||||
return error;
|
||||
|
||||
if (copy_to_user(name, tmp, sizeof(tmp)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE0(getpagesize)
|
||||
|
@ -561,24 +556,22 @@ SYSCALL_DEFINE0(getdtablesize)
|
|||
*/
|
||||
SYSCALL_DEFINE2(osf_getdomainname, char __user *, name, int, namelen)
|
||||
{
|
||||
unsigned len;
|
||||
int i;
|
||||
int len, err = 0;
|
||||
char *kname;
|
||||
char tmp[32];
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, name, namelen))
|
||||
return -EFAULT;
|
||||
|
||||
len = namelen;
|
||||
if (len > 32)
|
||||
len = 32;
|
||||
if (namelen < 0 || namelen > 32)
|
||||
namelen = 32;
|
||||
|
||||
down_read(&uts_sem);
|
||||
for (i = 0; i < len; ++i) {
|
||||
__put_user(utsname()->domainname[i], name + i);
|
||||
if (utsname()->domainname[i] == '\0')
|
||||
break;
|
||||
}
|
||||
kname = utsname()->domainname;
|
||||
len = strnlen(kname, namelen);
|
||||
len = min(len + 1, namelen);
|
||||
memcpy(tmp, kname, len);
|
||||
up_read(&uts_sem);
|
||||
|
||||
if (copy_to_user(name, tmp, len))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -741,13 +734,14 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count)
|
|||
};
|
||||
unsigned long offset;
|
||||
const char *res;
|
||||
long len, err = -EINVAL;
|
||||
long len;
|
||||
char tmp[__NEW_UTS_LEN + 1];
|
||||
|
||||
offset = command-1;
|
||||
if (offset >= ARRAY_SIZE(sysinfo_table)) {
|
||||
/* Digital UNIX has a few unpublished interfaces here */
|
||||
printk("sysinfo(%d)", command);
|
||||
goto out;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
down_read(&uts_sem);
|
||||
|
@ -755,13 +749,11 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count)
|
|||
len = strlen(res)+1;
|
||||
if ((unsigned long)len > (unsigned long)count)
|
||||
len = count;
|
||||
if (copy_to_user(buf, res, len))
|
||||
err = -EFAULT;
|
||||
else
|
||||
err = 0;
|
||||
memcpy(tmp, res, len);
|
||||
up_read(&uts_sem);
|
||||
out:
|
||||
return err;
|
||||
if (copy_to_user(buf, tmp, len))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE5(osf_getsysinfo, unsigned long, op, void __user *, buffer,
|
||||
|
|
|
@ -77,7 +77,7 @@ __load_new_mm_context(struct mm_struct *next_mm)
|
|||
/* Macro for exception fixup code to access integer registers. */
|
||||
#define dpf_reg(r) \
|
||||
(((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \
|
||||
(r) <= 18 ? (r)+8 : (r)-10])
|
||||
(r) <= 18 ? (r)+10 : (r)-10])
|
||||
|
||||
asmlinkage void
|
||||
do_page_fault(unsigned long address, unsigned long mmcsr,
|
||||
|
|
|
@ -23,7 +23,7 @@ config ARC
|
|||
select GENERIC_SMP_IDLE_THREAD
|
||||
select HAVE_ARCH_KGDB
|
||||
select HAVE_ARCH_TRACEHOOK
|
||||
select HAVE_FUTEX_CMPXCHG
|
||||
select HAVE_FUTEX_CMPXCHG if FUTEX
|
||||
select HAVE_IOREMAP_PROT
|
||||
select HAVE_KPROBES
|
||||
select HAVE_KRETPROBES
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
CONFIG_DEFAULT_HOSTNAME="ARCLinux"
|
||||
# CONFIG_SWAP is not set
|
||||
CONFIG_SYSVIPC=y
|
||||
CONFIG_POSIX_MQUEUE=y
|
||||
# CONFIG_CROSS_MEMORY_ATTACH is not set
|
||||
|
@ -98,6 +97,7 @@ CONFIG_VFAT_FS=y
|
|||
CONFIG_NTFS_FS=y
|
||||
CONFIG_TMPFS=y
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
CONFIG_NLS_CODEPAGE_437=y
|
||||
CONFIG_NLS_ISO8859_1=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
CONFIG_DEFAULT_HOSTNAME="ARCLinux"
|
||||
# CONFIG_SWAP is not set
|
||||
CONFIG_SYSVIPC=y
|
||||
CONFIG_POSIX_MQUEUE=y
|
||||
# CONFIG_CROSS_MEMORY_ATTACH is not set
|
||||
|
@ -98,6 +97,7 @@ CONFIG_VFAT_FS=y
|
|||
CONFIG_NTFS_FS=y
|
||||
CONFIG_TMPFS=y
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
CONFIG_NLS_CODEPAGE_437=y
|
||||
CONFIG_NLS_ISO8859_1=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
CONFIG_DEFAULT_HOSTNAME="ARCLinux"
|
||||
# CONFIG_SWAP is not set
|
||||
CONFIG_SYSVIPC=y
|
||||
CONFIG_POSIX_MQUEUE=y
|
||||
# CONFIG_CROSS_MEMORY_ATTACH is not set
|
||||
|
@ -99,6 +98,7 @@ CONFIG_VFAT_FS=y
|
|||
CONFIG_NTFS_FS=y
|
||||
CONFIG_TMPFS=y
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
CONFIG_NLS_CODEPAGE_437=y
|
||||
CONFIG_NLS_ISO8859_1=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
|
|
|
@ -76,6 +76,7 @@ CONFIG_PROC_KCORE=y
|
|||
CONFIG_TMPFS=y
|
||||
# CONFIG_MISC_FILESYSTEMS is not set
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
CONFIG_ROOT_NFS=y
|
||||
CONFIG_DEBUG_INFO=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
|
|
|
@ -71,5 +71,6 @@ CONFIG_EXT2_FS_XATTR=y
|
|||
CONFIG_TMPFS=y
|
||||
# CONFIG_MISC_FILESYSTEMS is not set
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||
|
|
|
@ -69,5 +69,6 @@ CONFIG_EXT2_FS_XATTR=y
|
|||
CONFIG_TMPFS=y
|
||||
# CONFIG_MISC_FILESYSTEMS is not set
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||
|
|
|
@ -80,6 +80,7 @@ CONFIG_EXT2_FS_XATTR=y
|
|||
CONFIG_TMPFS=y
|
||||
# CONFIG_MISC_FILESYSTEMS is not set
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||
CONFIG_FTRACE=y
|
||||
|
|
|
@ -88,6 +88,7 @@ CONFIG_NTFS_FS=y
|
|||
CONFIG_TMPFS=y
|
||||
CONFIG_JFFS2_FS=y
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
CONFIG_NLS_CODEPAGE_437=y
|
||||
CONFIG_NLS_ISO8859_1=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
|
|
|
@ -87,6 +87,7 @@ CONFIG_NTFS_FS=y
|
|||
CONFIG_TMPFS=y
|
||||
CONFIG_JFFS2_FS=y
|
||||
CONFIG_NFS_FS=y
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
CONFIG_NLS_CODEPAGE_437=y
|
||||
CONFIG_NLS_ISO8859_1=y
|
||||
# CONFIG_ENABLE_WARN_DEPRECATED is not set
|
||||
|
|
|
@ -84,7 +84,7 @@ static inline int atomic_fetch_##op(int i, atomic_t *v) \
|
|||
"1: llock %[orig], [%[ctr]] \n" \
|
||||
" " #asm_op " %[val], %[orig], %[i] \n" \
|
||||
" scond %[val], [%[ctr]] \n" \
|
||||
" \n" \
|
||||
" bnz 1b \n" \
|
||||
: [val] "=&r" (val), \
|
||||
[orig] "=&r" (orig) \
|
||||
: [ctr] "r" (&v->counter), \
|
||||
|
|
|
@ -340,7 +340,7 @@ static inline __attribute__ ((const)) int __fls(unsigned long x)
|
|||
/*
|
||||
* __ffs: Similar to ffs, but zero based (0-31)
|
||||
*/
|
||||
static inline __attribute__ ((const)) int __ffs(unsigned long word)
|
||||
static inline __attribute__ ((const)) unsigned long __ffs(unsigned long word)
|
||||
{
|
||||
if (!word)
|
||||
return word;
|
||||
|
@ -400,9 +400,9 @@ static inline __attribute__ ((const)) int ffs(unsigned long x)
|
|||
/*
|
||||
* __ffs: Similar to ffs, but zero based (0-31)
|
||||
*/
|
||||
static inline __attribute__ ((const)) int __ffs(unsigned long x)
|
||||
static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x)
|
||||
{
|
||||
int n;
|
||||
unsigned long n;
|
||||
|
||||
asm volatile(
|
||||
" ffs.f %0, %1 \n" /* 0:31; 31(Z) if src 0 */
|
||||
|
|
|
@ -23,7 +23,8 @@ void die(const char *str, struct pt_regs *regs, unsigned long address);
|
|||
|
||||
#define BUG() do { \
|
||||
pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
|
||||
dump_stack(); \
|
||||
barrier_before_unreachable(); \
|
||||
__builtin_trap(); \
|
||||
} while (0)
|
||||
|
||||
#define HAVE_ARCH_BUG
|
||||
|
|
|
@ -49,6 +49,17 @@
|
|||
|
||||
#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
|
||||
|
||||
/*
|
||||
* Make sure slab-allocated buffers are 64-bit aligned when atomic64_t uses
|
||||
* ARCv2 64-bit atomics (LLOCKD/SCONDD). This guarantees runtime 64-bit
|
||||
* alignment for any atomic64_t embedded in buffer.
|
||||
* Default ARCH_SLAB_MINALIGN is __alignof__(long long) which has a relaxed
|
||||
* value of 4 (and not 8) in ARC ABI.
|
||||
*/
|
||||
#if defined(CONFIG_ARC_HAS_LL64) && defined(CONFIG_ARC_HAS_LLSC)
|
||||
#define ARCH_SLAB_MINALIGN 8
|
||||
#endif
|
||||
|
||||
extern void arc_cache_init(void);
|
||||
extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
|
||||
extern void read_decode_cache_bcr(void);
|
||||
|
|
|
@ -92,8 +92,11 @@ __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new)
|
|||
|
||||
#endif /* CONFIG_ARC_HAS_LLSC */
|
||||
|
||||
#define cmpxchg(ptr, o, n) ((typeof(*(ptr)))__cmpxchg((ptr), \
|
||||
(unsigned long)(o), (unsigned long)(n)))
|
||||
#define cmpxchg(ptr, o, n) ({ \
|
||||
(typeof(*(ptr)))__cmpxchg((ptr), \
|
||||
(unsigned long)(o), \
|
||||
(unsigned long)(n)); \
|
||||
})
|
||||
|
||||
/*
|
||||
* atomic_cmpxchg is same as cmpxchg
|
||||
|
@ -198,8 +201,11 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
|
|||
return __xchg_bad_pointer();
|
||||
}
|
||||
|
||||
#define xchg(ptr, with) ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), \
|
||||
sizeof(*(ptr))))
|
||||
#define xchg(ptr, with) ({ \
|
||||
(typeof(*(ptr)))__xchg((unsigned long)(with), \
|
||||
(ptr), \
|
||||
sizeof(*(ptr))); \
|
||||
})
|
||||
|
||||
#endif /* CONFIG_ARC_PLAT_EZNPS */
|
||||
|
||||
|
|
|
@ -17,8 +17,11 @@
|
|||
#ifndef __ASM_ARC_UDELAY_H
|
||||
#define __ASM_ARC_UDELAY_H
|
||||
|
||||
#include <asm-generic/types.h>
|
||||
#include <asm/param.h> /* HZ */
|
||||
|
||||
extern unsigned long loops_per_jiffy;
|
||||
|
||||
static inline void __delay(unsigned long loops)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include <linux/types.h>
|
||||
#include <asm/byteorder.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
#ifdef CONFIG_ISA_ARCV2
|
||||
#include <asm/barrier.h>
|
||||
|
@ -94,6 +95,42 @@ static inline u32 __raw_readl(const volatile void __iomem *addr)
|
|||
return w;
|
||||
}
|
||||
|
||||
/*
|
||||
* {read,write}s{b,w,l}() repeatedly access the same IO address in
|
||||
* native endianness in 8-, 16-, 32-bit chunks {into,from} memory,
|
||||
* @count times
|
||||
*/
|
||||
#define __raw_readsx(t,f) \
|
||||
static inline void __raw_reads##f(const volatile void __iomem *addr, \
|
||||
void *ptr, unsigned int count) \
|
||||
{ \
|
||||
bool is_aligned = ((unsigned long)ptr % ((t) / 8)) == 0; \
|
||||
u##t *buf = ptr; \
|
||||
\
|
||||
if (!count) \
|
||||
return; \
|
||||
\
|
||||
/* Some ARC CPU's don't support unaligned accesses */ \
|
||||
if (is_aligned) { \
|
||||
do { \
|
||||
u##t x = __raw_read##f(addr); \
|
||||
*buf++ = x; \
|
||||
} while (--count); \
|
||||
} else { \
|
||||
do { \
|
||||
u##t x = __raw_read##f(addr); \
|
||||
put_unaligned(x, buf++); \
|
||||
} while (--count); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define __raw_readsb __raw_readsb
|
||||
__raw_readsx(8, b)
|
||||
#define __raw_readsw __raw_readsw
|
||||
__raw_readsx(16, w)
|
||||
#define __raw_readsl __raw_readsl
|
||||
__raw_readsx(32, l)
|
||||
|
||||
#define __raw_writeb __raw_writeb
|
||||
static inline void __raw_writeb(u8 b, volatile void __iomem *addr)
|
||||
{
|
||||
|
@ -126,6 +163,35 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
|
|||
|
||||
}
|
||||
|
||||
#define __raw_writesx(t,f) \
|
||||
static inline void __raw_writes##f(volatile void __iomem *addr, \
|
||||
const void *ptr, unsigned int count) \
|
||||
{ \
|
||||
bool is_aligned = ((unsigned long)ptr % ((t) / 8)) == 0; \
|
||||
const u##t *buf = ptr; \
|
||||
\
|
||||
if (!count) \
|
||||
return; \
|
||||
\
|
||||
/* Some ARC CPU's don't support unaligned accesses */ \
|
||||
if (is_aligned) { \
|
||||
do { \
|
||||
__raw_write##f(*buf++, addr); \
|
||||
} while (--count); \
|
||||
} else { \
|
||||
do { \
|
||||
__raw_write##f(get_unaligned(buf++), addr); \
|
||||
} while (--count); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define __raw_writesb __raw_writesb
|
||||
__raw_writesx(8, b)
|
||||
#define __raw_writesw __raw_writesw
|
||||
__raw_writesx(16, w)
|
||||
#define __raw_writesl __raw_writesl
|
||||
__raw_writesx(32, l)
|
||||
|
||||
/*
|
||||
* MMIO can also get buffered/optimized in micro-arch, so barriers needed
|
||||
* Based on ARM model for the typical use case
|
||||
|
@ -141,10 +207,16 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
|
|||
#define readb(c) ({ u8 __v = readb_relaxed(c); __iormb(); __v; })
|
||||
#define readw(c) ({ u16 __v = readw_relaxed(c); __iormb(); __v; })
|
||||
#define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(); __v; })
|
||||
#define readsb(p,d,l) ({ __raw_readsb(p,d,l); __iormb(); })
|
||||
#define readsw(p,d,l) ({ __raw_readsw(p,d,l); __iormb(); })
|
||||
#define readsl(p,d,l) ({ __raw_readsl(p,d,l); __iormb(); })
|
||||
|
||||
#define writeb(v,c) ({ __iowmb(); writeb_relaxed(v,c); })
|
||||
#define writew(v,c) ({ __iowmb(); writew_relaxed(v,c); })
|
||||
#define writel(v,c) ({ __iowmb(); writel_relaxed(v,c); })
|
||||
#define writesb(p,d,l) ({ __iowmb(); __raw_writesb(p,d,l); })
|
||||
#define writesw(p,d,l) ({ __iowmb(); __raw_writesw(p,d,l); })
|
||||
#define writesl(p,d,l) ({ __iowmb(); __raw_writesl(p,d,l); })
|
||||
|
||||
/*
|
||||
* Relaxed API for drivers which can handle barrier ordering themselves
|
||||
|
|
|
@ -34,9 +34,7 @@ struct machine_desc {
|
|||
const char *name;
|
||||
const char **dt_compat;
|
||||
void (*init_early)(void);
|
||||
#ifdef CONFIG_SMP
|
||||
void (*init_per_cpu)(unsigned int);
|
||||
#endif
|
||||
void (*init_machine)(void);
|
||||
void (*init_late)(void);
|
||||
|
||||
|
|
|
@ -103,7 +103,8 @@ static const char * const arc_pmu_ev_hw_map[] = {
|
|||
|
||||
/* counts condition */
|
||||
[PERF_COUNT_HW_INSTRUCTIONS] = "iall",
|
||||
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */
|
||||
/* All jump instructions that are taken */
|
||||
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmptak",
|
||||
[PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */
|
||||
#ifdef CONFIG_ISA_ARCV2
|
||||
[PERF_COUNT_HW_BRANCH_MISSES] = "bpmp",
|
||||
|
|
|
@ -209,7 +209,7 @@ __arc_copy_from_user(void *to, const void __user *from, unsigned long n)
|
|||
*/
|
||||
"=&r" (tmp), "+r" (to), "+r" (from)
|
||||
:
|
||||
: "lp_count", "lp_start", "lp_end", "memory");
|
||||
: "lp_count", "memory");
|
||||
|
||||
return n;
|
||||
}
|
||||
|
@ -438,7 +438,7 @@ __arc_copy_to_user(void __user *to, const void *from, unsigned long n)
|
|||
*/
|
||||
"=&r" (tmp), "+r" (to), "+r" (from)
|
||||
:
|
||||
: "lp_count", "lp_start", "lp_end", "memory");
|
||||
: "lp_count", "memory");
|
||||
|
||||
return n;
|
||||
}
|
||||
|
@ -658,7 +658,7 @@ static inline unsigned long __arc_clear_user(void __user *to, unsigned long n)
|
|||
" .previous \n"
|
||||
: "+r"(d_char), "+r"(res)
|
||||
: "i"(0)
|
||||
: "lp_count", "lp_start", "lp_end", "memory");
|
||||
: "lp_count", "memory");
|
||||
|
||||
return res;
|
||||
}
|
||||
|
@ -691,7 +691,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
|
|||
" .previous \n"
|
||||
: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
|
||||
: "g"(-EFAULT), "r"(count)
|
||||
: "lp_count", "lp_start", "lp_end", "memory");
|
||||
: "lp_count", "memory");
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <asm/entry.h>
|
||||
#include <asm/arcregs.h>
|
||||
#include <asm/cache.h>
|
||||
#include <asm/irqflags.h>
|
||||
|
||||
.macro CPU_EARLY_SETUP
|
||||
|
||||
|
@ -47,6 +48,15 @@
|
|||
sr r5, [ARC_REG_DC_CTRL]
|
||||
|
||||
1:
|
||||
|
||||
#ifdef CONFIG_ISA_ARCV2
|
||||
; Unaligned access is disabled at reset, so re-enable early as
|
||||
; gcc 7.3.1 (ARC GNU 2018.03) onwards generates unaligned access
|
||||
; by default
|
||||
lr r5, [status32]
|
||||
bset r5, r5, STATUS_AD_BIT
|
||||
kflag r5
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.section .init.text, "ax",@progbits
|
||||
|
@ -93,10 +103,11 @@ ENTRY(stext)
|
|||
#ifdef CONFIG_ARC_UBOOT_SUPPORT
|
||||
; Uboot - kernel ABI
|
||||
; r0 = [0] No uboot interaction, [1] cmdline in r2, [2] DTB in r2
|
||||
; r1 = magic number (board identity, unused as of now
|
||||
; r1 = magic number (always zero as of now)
|
||||
; r2 = pointer to uboot provided cmdline or external DTB in mem
|
||||
; These are handled later in setup_arch()
|
||||
; These are handled later in handle_uboot_args()
|
||||
st r0, [@uboot_tag]
|
||||
st r1, [@uboot_magic]
|
||||
st r2, [@uboot_arg]
|
||||
#endif
|
||||
|
||||
|
|
|
@ -31,10 +31,10 @@ void __init init_IRQ(void)
|
|||
/* a SMP H/w block could do IPI IRQ request here */
|
||||
if (plat_smp_ops.init_per_cpu)
|
||||
plat_smp_ops.init_per_cpu(smp_processor_id());
|
||||
#endif
|
||||
|
||||
if (machine_desc->init_per_cpu)
|
||||
machine_desc->init_per_cpu(smp_processor_id());
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -488,8 +488,8 @@ static int arc_pmu_device_probe(struct platform_device *pdev)
|
|||
/* loop thru all available h/w condition indexes */
|
||||
for (j = 0; j < cc_bcr.c; j++) {
|
||||
write_aux_reg(ARC_REG_CC_INDEX, j);
|
||||
cc_name.indiv.word0 = read_aux_reg(ARC_REG_CC_NAME0);
|
||||
cc_name.indiv.word1 = read_aux_reg(ARC_REG_CC_NAME1);
|
||||
cc_name.indiv.word0 = le32_to_cpu(read_aux_reg(ARC_REG_CC_NAME0));
|
||||
cc_name.indiv.word1 = le32_to_cpu(read_aux_reg(ARC_REG_CC_NAME1));
|
||||
|
||||
/* See if it has been mapped to a perf event_id */
|
||||
for (i = 0; i < ARRAY_SIZE(arc_pmu_ev_hw_map); i++) {
|
||||
|
|
|
@ -44,7 +44,8 @@ SYSCALL_DEFINE0(arc_gettls)
|
|||
SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new)
|
||||
{
|
||||
struct pt_regs *regs = current_pt_regs();
|
||||
int uval = -EFAULT;
|
||||
u32 uval;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* This is only for old cores lacking LLOCK/SCOND, which by definition
|
||||
|
@ -57,23 +58,47 @@ SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new)
|
|||
/* Z indicates to userspace if operation succeded */
|
||||
regs->status32 &= ~STATUS_Z_MASK;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
|
||||
return -EFAULT;
|
||||
ret = access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr));
|
||||
if (!ret)
|
||||
goto fail;
|
||||
|
||||
again:
|
||||
preempt_disable();
|
||||
|
||||
if (__get_user(uval, uaddr))
|
||||
goto done;
|
||||
ret = __get_user(uval, uaddr);
|
||||
if (ret)
|
||||
goto fault;
|
||||
|
||||
if (uval == expected) {
|
||||
if (!__put_user(new, uaddr))
|
||||
regs->status32 |= STATUS_Z_MASK;
|
||||
}
|
||||
if (uval != expected)
|
||||
goto out;
|
||||
|
||||
done:
|
||||
ret = __put_user(new, uaddr);
|
||||
if (ret)
|
||||
goto fault;
|
||||
|
||||
regs->status32 |= STATUS_Z_MASK;
|
||||
|
||||
out:
|
||||
preempt_enable();
|
||||
return uval;
|
||||
|
||||
fault:
|
||||
preempt_enable();
|
||||
|
||||
return uval;
|
||||
if (unlikely(ret != -EFAULT))
|
||||
goto fail;
|
||||
|
||||
down_read(&current->mm->mmap_sem);
|
||||
ret = fixup_user_fault(current, current->mm, (unsigned long) uaddr,
|
||||
FAULT_FLAG_WRITE, NULL);
|
||||
up_read(&current->mm->mmap_sem);
|
||||
|
||||
if (likely(!ret))
|
||||
goto again;
|
||||
|
||||
fail:
|
||||
force_sig(SIGSEGV, current);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void arch_cpu_idle(void)
|
||||
|
@ -188,6 +213,26 @@ int copy_thread(unsigned long clone_flags,
|
|||
task_thread_info(current)->thr_ptr;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* setup usermode thread pointer #1:
|
||||
* when child is picked by scheduler, __switch_to() uses @c_callee to
|
||||
* populate usermode callee regs: this works (despite being in a kernel
|
||||
* function) since special return path for child @ret_from_fork()
|
||||
* ensures those regs are not clobbered all the way to RTIE to usermode
|
||||
*/
|
||||
c_callee->r25 = task_thread_info(p)->thr_ptr;
|
||||
|
||||
#ifdef CONFIG_ARC_CURR_IN_REG
|
||||
/*
|
||||
* setup usermode thread pointer #2:
|
||||
* however for this special use of r25 in kernel, __switch_to() sets
|
||||
* r25 for kernel needs and only in the final return path is usermode
|
||||
* r25 setup, from pt_regs->user_r25. So set that up as well
|
||||
*/
|
||||
c_regs->user_r25 = c_callee->r25;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,6 +32,7 @@ unsigned int intr_to_DE_cnt;
|
|||
|
||||
/* Part of U-boot ABI: see head.S */
|
||||
int __initdata uboot_tag;
|
||||
int __initdata uboot_magic;
|
||||
char __initdata *uboot_arg;
|
||||
|
||||
const struct machine_desc *machine_desc;
|
||||
|
@ -381,43 +382,87 @@ void setup_processor(void)
|
|||
arc_chk_core_config();
|
||||
}
|
||||
|
||||
static inline int is_kernel(unsigned long addr)
|
||||
static inline bool uboot_arg_invalid(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
|
||||
return 1;
|
||||
return 0;
|
||||
/*
|
||||
* Check that it is a untranslated address (although MMU is not enabled
|
||||
* yet, it being a high address ensures this is not by fluke)
|
||||
*/
|
||||
if (addr < PAGE_OFFSET)
|
||||
return true;
|
||||
|
||||
/* Check that address doesn't clobber resident kernel image */
|
||||
return addr >= (unsigned long)_stext && addr <= (unsigned long)_end;
|
||||
}
|
||||
|
||||
#define IGNORE_ARGS "Ignore U-boot args: "
|
||||
|
||||
/* uboot_tag values for U-boot - kernel ABI revision 0; see head.S */
|
||||
#define UBOOT_TAG_NONE 0
|
||||
#define UBOOT_TAG_CMDLINE 1
|
||||
#define UBOOT_TAG_DTB 2
|
||||
/* We always pass 0 as magic from U-boot */
|
||||
#define UBOOT_MAGIC_VALUE 0
|
||||
|
||||
void __init handle_uboot_args(void)
|
||||
{
|
||||
bool use_embedded_dtb = true;
|
||||
bool append_cmdline = false;
|
||||
|
||||
#ifdef CONFIG_ARC_UBOOT_SUPPORT
|
||||
/* check that we know this tag */
|
||||
if (uboot_tag != UBOOT_TAG_NONE &&
|
||||
uboot_tag != UBOOT_TAG_CMDLINE &&
|
||||
uboot_tag != UBOOT_TAG_DTB) {
|
||||
pr_warn(IGNORE_ARGS "invalid uboot tag: '%08x'\n", uboot_tag);
|
||||
goto ignore_uboot_args;
|
||||
}
|
||||
|
||||
if (uboot_magic != UBOOT_MAGIC_VALUE) {
|
||||
pr_warn(IGNORE_ARGS "non zero uboot magic\n");
|
||||
goto ignore_uboot_args;
|
||||
}
|
||||
|
||||
if (uboot_tag != UBOOT_TAG_NONE &&
|
||||
uboot_arg_invalid((unsigned long)uboot_arg)) {
|
||||
pr_warn(IGNORE_ARGS "invalid uboot arg: '%px'\n", uboot_arg);
|
||||
goto ignore_uboot_args;
|
||||
}
|
||||
|
||||
/* see if U-boot passed an external Device Tree blob */
|
||||
if (uboot_tag == UBOOT_TAG_DTB) {
|
||||
machine_desc = setup_machine_fdt((void *)uboot_arg);
|
||||
|
||||
/* external Device Tree blob is invalid - use embedded one */
|
||||
use_embedded_dtb = !machine_desc;
|
||||
}
|
||||
|
||||
if (uboot_tag == UBOOT_TAG_CMDLINE)
|
||||
append_cmdline = true;
|
||||
|
||||
ignore_uboot_args:
|
||||
#endif
|
||||
|
||||
if (use_embedded_dtb) {
|
||||
machine_desc = setup_machine_fdt(__dtb_start);
|
||||
if (!machine_desc)
|
||||
panic("Embedded DT invalid\n");
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: @boot_command_line is populated by setup_machine_fdt() so this
|
||||
* append processing can only happen after.
|
||||
*/
|
||||
if (append_cmdline) {
|
||||
/* Ensure a whitespace between the 2 cmdlines */
|
||||
strlcat(boot_command_line, " ", COMMAND_LINE_SIZE);
|
||||
strlcat(boot_command_line, uboot_arg, COMMAND_LINE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
void __init setup_arch(char **cmdline_p)
|
||||
{
|
||||
#ifdef CONFIG_ARC_UBOOT_SUPPORT
|
||||
/* make sure that uboot passed pointer to cmdline/dtb is valid */
|
||||
if (uboot_tag && is_kernel((unsigned long)uboot_arg))
|
||||
panic("Invalid uboot arg\n");
|
||||
|
||||
/* See if u-boot passed an external Device Tree blob */
|
||||
machine_desc = setup_machine_fdt(uboot_arg); /* uboot_tag == 2 */
|
||||
if (!machine_desc)
|
||||
#endif
|
||||
{
|
||||
/* No, so try the embedded one */
|
||||
machine_desc = setup_machine_fdt(__dtb_start);
|
||||
if (!machine_desc)
|
||||
panic("Embedded DT invalid\n");
|
||||
|
||||
/*
|
||||
* If we are here, it is established that @uboot_arg didn't
|
||||
* point to DT blob. Instead if u-boot says it is cmdline,
|
||||
* append to embedded DT cmdline.
|
||||
* setup_machine_fdt() would have populated @boot_command_line
|
||||
*/
|
||||
if (uboot_tag == 1) {
|
||||
/* Ensure a whitespace between the 2 cmdlines */
|
||||
strlcat(boot_command_line, " ", COMMAND_LINE_SIZE);
|
||||
strlcat(boot_command_line, uboot_arg,
|
||||
COMMAND_LINE_SIZE);
|
||||
}
|
||||
}
|
||||
handle_uboot_args();
|
||||
|
||||
/* Save unparsed command line copy for /proc/cmdline */
|
||||
*cmdline_p = boot_command_line;
|
||||
|
|
|
@ -155,3 +155,12 @@ void do_insterror_or_kprobe(unsigned long address, struct pt_regs *regs)
|
|||
|
||||
insterror_is_error(address, regs);
|
||||
}
|
||||
|
||||
/*
|
||||
* abort() call generated by older gcc for __builtin_trap()
|
||||
*/
|
||||
void abort(void)
|
||||
{
|
||||
__asm__ __volatile__("trap_s 5\n");
|
||||
}
|
||||
EXPORT_SYMBOL(abort);
|
||||
|
|
|
@ -185,11 +185,6 @@ static void *__init unw_hdr_alloc_early(unsigned long sz)
|
|||
MAX_DMA_ADDRESS);
|
||||
}
|
||||
|
||||
static void *unw_hdr_alloc(unsigned long sz)
|
||||
{
|
||||
return kmalloc(sz, GFP_KERNEL);
|
||||
}
|
||||
|
||||
static void init_unwind_table(struct unwind_table *table, const char *name,
|
||||
const void *core_start, unsigned long core_size,
|
||||
const void *init_start, unsigned long init_size,
|
||||
|
@ -370,6 +365,10 @@ ret_err:
|
|||
}
|
||||
|
||||
#ifdef CONFIG_MODULES
|
||||
static void *unw_hdr_alloc(unsigned long sz)
|
||||
{
|
||||
return kmalloc(sz, GFP_KERNEL);
|
||||
}
|
||||
|
||||
static struct unwind_table *last_table;
|
||||
|
||||
|
|
|
@ -25,15 +25,11 @@
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_ARC_HAS_LL64
|
||||
# define PREFETCH_READ(RX) prefetch [RX, 56]
|
||||
# define PREFETCH_WRITE(RX) prefetchw [RX, 64]
|
||||
# define LOADX(DST,RX) ldd.ab DST, [RX, 8]
|
||||
# define STOREX(SRC,RX) std.ab SRC, [RX, 8]
|
||||
# define ZOLSHFT 5
|
||||
# define ZOLAND 0x1F
|
||||
#else
|
||||
# define PREFETCH_READ(RX) prefetch [RX, 28]
|
||||
# define PREFETCH_WRITE(RX) prefetchw [RX, 32]
|
||||
# define LOADX(DST,RX) ld.ab DST, [RX, 4]
|
||||
# define STOREX(SRC,RX) st.ab SRC, [RX, 4]
|
||||
# define ZOLSHFT 4
|
||||
|
@ -41,8 +37,6 @@
|
|||
#endif
|
||||
|
||||
ENTRY_CFI(memcpy)
|
||||
prefetch [r1] ; Prefetch the read location
|
||||
prefetchw [r0] ; Prefetch the write location
|
||||
mov.f 0, r2
|
||||
;;; if size is zero
|
||||
jz.d [blink]
|
||||
|
@ -72,8 +66,6 @@ ENTRY_CFI(memcpy)
|
|||
lpnz @.Lcopy32_64bytes
|
||||
;; LOOP START
|
||||
LOADX (r6, r1)
|
||||
PREFETCH_READ (r1)
|
||||
PREFETCH_WRITE (r3)
|
||||
LOADX (r8, r1)
|
||||
LOADX (r10, r1)
|
||||
LOADX (r4, r1)
|
||||
|
@ -117,9 +109,7 @@ ENTRY_CFI(memcpy)
|
|||
lpnz @.Lcopy8bytes_1
|
||||
;; LOOP START
|
||||
ld.ab r6, [r1, 4]
|
||||
prefetch [r1, 28] ;Prefetch the next read location
|
||||
ld.ab r8, [r1,4]
|
||||
prefetchw [r3, 32] ;Prefetch the next write location
|
||||
|
||||
SHIFT_1 (r7, r6, 24)
|
||||
or r7, r7, r5
|
||||
|
@ -162,9 +152,7 @@ ENTRY_CFI(memcpy)
|
|||
lpnz @.Lcopy8bytes_2
|
||||
;; LOOP START
|
||||
ld.ab r6, [r1, 4]
|
||||
prefetch [r1, 28] ;Prefetch the next read location
|
||||
ld.ab r8, [r1,4]
|
||||
prefetchw [r3, 32] ;Prefetch the next write location
|
||||
|
||||
SHIFT_1 (r7, r6, 16)
|
||||
or r7, r7, r5
|
||||
|
@ -204,9 +192,7 @@ ENTRY_CFI(memcpy)
|
|||
lpnz @.Lcopy8bytes_3
|
||||
;; LOOP START
|
||||
ld.ab r6, [r1, 4]
|
||||
prefetch [r1, 28] ;Prefetch the next read location
|
||||
ld.ab r8, [r1,4]
|
||||
prefetchw [r3, 32] ;Prefetch the next write location
|
||||
|
||||
SHIFT_1 (r7, r6, 8)
|
||||
or r7, r7, r5
|
||||
|
|
|
@ -7,11 +7,39 @@
|
|||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/cache.h>
|
||||
|
||||
#undef PREALLOC_NOT_AVAIL
|
||||
/*
|
||||
* The memset implementation below is optimized to use prefetchw and prealloc
|
||||
* instruction in case of CPU with 64B L1 data cache line (L1_CACHE_SHIFT == 6)
|
||||
* If you want to implement optimized memset for other possible L1 data cache
|
||||
* line lengths (32B and 128B) you should rewrite code carefully checking
|
||||
* we don't call any prefetchw/prealloc instruction for L1 cache lines which
|
||||
* don't belongs to memset area.
|
||||
*/
|
||||
|
||||
#if L1_CACHE_SHIFT == 6
|
||||
|
||||
.macro PREALLOC_INSTR reg, off
|
||||
prealloc [\reg, \off]
|
||||
.endm
|
||||
|
||||
.macro PREFETCHW_INSTR reg, off
|
||||
prefetchw [\reg, \off]
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro PREALLOC_INSTR
|
||||
.endm
|
||||
|
||||
.macro PREFETCHW_INSTR
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
||||
ENTRY_CFI(memset)
|
||||
prefetchw [r0] ; Prefetch the write location
|
||||
PREFETCHW_INSTR r0, 0 ; Prefetch the first write location
|
||||
mov.f 0, r2
|
||||
;;; if size is zero
|
||||
jz.d [blink]
|
||||
|
@ -48,11 +76,8 @@ ENTRY_CFI(memset)
|
|||
|
||||
lpnz @.Lset64bytes
|
||||
;; LOOP START
|
||||
#ifdef PREALLOC_NOT_AVAIL
|
||||
prefetchw [r3, 64] ;Prefetch the next write location
|
||||
#else
|
||||
prealloc [r3, 64]
|
||||
#endif
|
||||
PREALLOC_INSTR r3, 64 ; alloc next line w/o fetching
|
||||
|
||||
#ifdef CONFIG_ARC_HAS_LL64
|
||||
std.ab r4, [r3, 8]
|
||||
std.ab r4, [r3, 8]
|
||||
|
@ -85,7 +110,6 @@ ENTRY_CFI(memset)
|
|||
lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes
|
||||
lpnz .Lset32bytes
|
||||
;; LOOP START
|
||||
prefetchw [r3, 32] ;Prefetch the next write location
|
||||
#ifdef CONFIG_ARC_HAS_LL64
|
||||
std.ab r4, [r3, 8]
|
||||
std.ab r4, [r3, 8]
|
||||
|
|
|
@ -840,7 +840,7 @@ void flush_cache_mm(struct mm_struct *mm)
|
|||
void flush_cache_page(struct vm_area_struct *vma, unsigned long u_vaddr,
|
||||
unsigned long pfn)
|
||||
{
|
||||
unsigned int paddr = pfn << PAGE_SHIFT;
|
||||
phys_addr_t paddr = pfn << PAGE_SHIFT;
|
||||
|
||||
u_vaddr &= PAGE_MASK;
|
||||
|
||||
|
@ -860,8 +860,9 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page,
|
|||
unsigned long u_vaddr)
|
||||
{
|
||||
/* TBD: do we really need to clear the kernel mapping */
|
||||
__flush_dcache_page(page_address(page), u_vaddr);
|
||||
__flush_dcache_page(page_address(page), page_address(page));
|
||||
__flush_dcache_page((phys_addr_t)page_address(page), u_vaddr);
|
||||
__flush_dcache_page((phys_addr_t)page_address(page),
|
||||
(phys_addr_t)page_address(page));
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -890,9 +890,11 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
|
|||
struct pt_regs *regs)
|
||||
{
|
||||
struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
|
||||
unsigned int pd0[mmu->ways];
|
||||
unsigned long flags;
|
||||
int set;
|
||||
int set, n_ways = mmu->ways;
|
||||
|
||||
n_ways = min(n_ways, 4);
|
||||
BUG_ON(mmu->ways > 4);
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
|
@ -900,9 +902,10 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
|
|||
for (set = 0; set < mmu->sets; set++) {
|
||||
|
||||
int is_valid, way;
|
||||
unsigned int pd0[4];
|
||||
|
||||
/* read out all the ways of current set */
|
||||
for (way = 0, is_valid = 0; way < mmu->ways; way++) {
|
||||
for (way = 0, is_valid = 0; way < n_ways; way++) {
|
||||
write_aux_reg(ARC_REG_TLBINDEX,
|
||||
SET_WAY_TO_IDX(mmu, set, way));
|
||||
write_aux_reg(ARC_REG_TLBCOMMAND, TLBRead);
|
||||
|
@ -916,14 +919,14 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
|
|||
continue;
|
||||
|
||||
/* Scan the set for duplicate ways: needs a nested loop */
|
||||
for (way = 0; way < mmu->ways - 1; way++) {
|
||||
for (way = 0; way < n_ways - 1; way++) {
|
||||
|
||||
int n;
|
||||
|
||||
if (!pd0[way])
|
||||
continue;
|
||||
|
||||
for (n = way + 1; n < mmu->ways; n++) {
|
||||
for (n = way + 1; n < n_ways; n++) {
|
||||
if (pd0[way] != pd0[n])
|
||||
continue;
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#error "Incorrect ctop.h include"
|
||||
#endif
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <soc/nps/common.h>
|
||||
|
||||
/* core auxiliary registers */
|
||||
|
|
|
@ -1457,6 +1457,7 @@ config NR_CPUS
|
|||
config HOTPLUG_CPU
|
||||
bool "Support for hot-pluggable CPUs"
|
||||
depends on SMP
|
||||
select GENERIC_IRQ_MIGRATION
|
||||
help
|
||||
Say Y here to experiment with turning CPUs off and on. CPUs
|
||||
can be controlled through /sys/devices/system/cpu.
|
||||
|
|
|
@ -987,14 +987,21 @@ choice
|
|||
Say Y here if you want kernel low-level debugging support
|
||||
on SOCFPGA(Cyclone 5 and Arria 5) based platforms.
|
||||
|
||||
config DEBUG_SOCFPGA_UART1
|
||||
config DEBUG_SOCFPGA_ARRIA10_UART1
|
||||
depends on ARCH_SOCFPGA
|
||||
bool "Use SOCFPGA UART1 for low-level debug"
|
||||
bool "Use SOCFPGA Arria10 UART1 for low-level debug"
|
||||
select DEBUG_UART_8250
|
||||
help
|
||||
Say Y here if you want kernel low-level debugging support
|
||||
on SOCFPGA(Arria 10) based platforms.
|
||||
|
||||
config DEBUG_SOCFPGA_CYCLONE5_UART1
|
||||
depends on ARCH_SOCFPGA
|
||||
bool "Use SOCFPGA Cyclone 5 UART1 for low-level debug"
|
||||
select DEBUG_UART_8250
|
||||
help
|
||||
Say Y here if you want kernel low-level debugging support
|
||||
on SOCFPGA(Cyclone 5 and Arria 5) based platforms.
|
||||
|
||||
config DEBUG_SUN9I_UART0
|
||||
bool "Kernel low-level debugging messages via sun9i UART0"
|
||||
|
@ -1340,21 +1347,21 @@ config DEBUG_OMAP2PLUS_UART
|
|||
depends on ARCH_OMAP2PLUS
|
||||
|
||||
config DEBUG_IMX_UART_PORT
|
||||
int "i.MX Debug UART Port Selection" if DEBUG_IMX1_UART || \
|
||||
DEBUG_IMX25_UART || \
|
||||
DEBUG_IMX21_IMX27_UART || \
|
||||
DEBUG_IMX31_UART || \
|
||||
DEBUG_IMX35_UART || \
|
||||
DEBUG_IMX50_UART || \
|
||||
DEBUG_IMX51_UART || \
|
||||
DEBUG_IMX53_UART || \
|
||||
DEBUG_IMX6Q_UART || \
|
||||
DEBUG_IMX6SL_UART || \
|
||||
DEBUG_IMX6SX_UART || \
|
||||
DEBUG_IMX6UL_UART || \
|
||||
DEBUG_IMX7D_UART
|
||||
int "i.MX Debug UART Port Selection"
|
||||
depends on DEBUG_IMX1_UART || \
|
||||
DEBUG_IMX25_UART || \
|
||||
DEBUG_IMX21_IMX27_UART || \
|
||||
DEBUG_IMX31_UART || \
|
||||
DEBUG_IMX35_UART || \
|
||||
DEBUG_IMX50_UART || \
|
||||
DEBUG_IMX51_UART || \
|
||||
DEBUG_IMX53_UART || \
|
||||
DEBUG_IMX6Q_UART || \
|
||||
DEBUG_IMX6SL_UART || \
|
||||
DEBUG_IMX6SX_UART || \
|
||||
DEBUG_IMX6UL_UART || \
|
||||
DEBUG_IMX7D_UART
|
||||
default 1
|
||||
depends on ARCH_MXC
|
||||
help
|
||||
Choose UART port on which kernel low-level debug messages
|
||||
should be output.
|
||||
|
@ -1534,7 +1541,8 @@ config DEBUG_UART_PHYS
|
|||
default 0xfe800000 if ARCH_IOP32X
|
||||
default 0xff690000 if DEBUG_RK32_UART2
|
||||
default 0xffc02000 if DEBUG_SOCFPGA_UART0
|
||||
default 0xffc02100 if DEBUG_SOCFPGA_UART1
|
||||
default 0xffc02100 if DEBUG_SOCFPGA_ARRIA10_UART1
|
||||
default 0xffc03000 if DEBUG_SOCFPGA_CYCLONE5_UART1
|
||||
default 0xffd82340 if ARCH_IOP13XX
|
||||
default 0xffe40000 if DEBUG_RCAR_GEN1_SCIF0
|
||||
default 0xffe42000 if DEBUG_RCAR_GEN1_SCIF2
|
||||
|
@ -1624,7 +1632,8 @@ config DEBUG_UART_VIRT
|
|||
default 0xfeb30c00 if DEBUG_KEYSTONE_UART0
|
||||
default 0xfeb31000 if DEBUG_KEYSTONE_UART1
|
||||
default 0xfec02000 if DEBUG_SOCFPGA_UART0
|
||||
default 0xfec02100 if DEBUG_SOCFPGA_UART1
|
||||
default 0xfec02100 if DEBUG_SOCFPGA_ARRIA10_UART1
|
||||
default 0xfec03000 if DEBUG_SOCFPGA_CYCLONE5_UART1
|
||||
default 0xfec12000 if (DEBUG_MVEBU_UART0 || DEBUG_MVEBU_UART0_ALTERNATE) && ARCH_MVEBU
|
||||
default 0xfec12100 if DEBUG_MVEBU_UART1_ALTERNATE
|
||||
default 0xfec10000 if DEBUG_SIRFATLAS7_UART0
|
||||
|
@ -1672,9 +1681,9 @@ config DEBUG_UART_8250_WORD
|
|||
depends on DEBUG_LL_UART_8250 || DEBUG_UART_8250
|
||||
depends on DEBUG_UART_8250_SHIFT >= 2
|
||||
default y if DEBUG_PICOXCELL_UART || \
|
||||
DEBUG_SOCFPGA_UART0 || DEBUG_SOCFPGA_UART1 || \
|
||||
DEBUG_KEYSTONE_UART0 || DEBUG_KEYSTONE_UART1 || \
|
||||
DEBUG_ALPINE_UART0 || \
|
||||
DEBUG_SOCFPGA_UART0 || DEBUG_SOCFPGA_ARRIA10_UART1 || \
|
||||
DEBUG_SOCFPGA_CYCLONE5_UART1 || DEBUG_KEYSTONE_UART0 || \
|
||||
DEBUG_KEYSTONE_UART1 || DEBUG_ALPINE_UART0 || \
|
||||
DEBUG_DAVINCI_DMx_UART0 || DEBUG_DAVINCI_DA8XX_UART1 || \
|
||||
DEBUG_DAVINCI_DA8XX_UART2 || \
|
||||
DEBUG_BCM_KONA_UART || DEBUG_RK32_UART2
|
||||
|
|
|
@ -104,7 +104,7 @@ tune-$(CONFIG_CPU_V6K) =$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
|
|||
tune-y := $(tune-y)
|
||||
|
||||
ifeq ($(CONFIG_AEABI),y)
|
||||
CFLAGS_ABI :=-mabi=aapcs-linux -mno-thumb-interwork -mfpu=vfp
|
||||
CFLAGS_ABI :=-mabi=aapcs-linux -mfpu=vfp
|
||||
else
|
||||
CFLAGS_ABI :=$(call cc-option,-mapcs-32,-mabi=apcs-gnu) $(call cc-option,-mno-thumb-interwork,)
|
||||
endif
|
||||
|
|
|
@ -112,7 +112,7 @@ CFLAGS_fdt_ro.o := $(nossp_flags)
|
|||
CFLAGS_fdt_rw.o := $(nossp_flags)
|
||||
CFLAGS_fdt_wip.o := $(nossp_flags)
|
||||
|
||||
ccflags-y := -fpic -mno-single-pic-base -fno-builtin -I$(obj)
|
||||
ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin -I$(obj)
|
||||
asflags-y := -DZIMAGE
|
||||
|
||||
# Supply kernel BSS size to the decompressor via a linker symbol.
|
||||
|
|
|
@ -17,14 +17,13 @@
|
|||
@ there.
|
||||
.inst 'M' | ('Z' << 8) | (0x1310 << 16) @ tstne r0, #0x4d000
|
||||
#else
|
||||
mov r0, r0
|
||||
AR_CLASS( mov r0, r0 )
|
||||
M_CLASS( nop.w )
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro __EFI_HEADER
|
||||
#ifdef CONFIG_EFI_STUB
|
||||
b __efi_start
|
||||
|
||||
.set start_offset, __efi_start - start
|
||||
.org start + 0x3c
|
||||
@
|
||||
|
|
|
@ -130,19 +130,22 @@ start:
|
|||
.rept 7
|
||||
__nop
|
||||
.endr
|
||||
ARM( mov r0, r0 )
|
||||
ARM( b 1f )
|
||||
THUMB( badr r12, 1f )
|
||||
THUMB( bx r12 )
|
||||
#ifndef CONFIG_THUMB2_KERNEL
|
||||
mov r0, r0
|
||||
#else
|
||||
AR_CLASS( sub pc, pc, #3 ) @ A/R: switch to Thumb2 mode
|
||||
M_CLASS( nop.w ) @ M: already in Thumb2 mode
|
||||
.thumb
|
||||
#endif
|
||||
W(b) 1f
|
||||
|
||||
.word _magic_sig @ Magic numbers to help the loader
|
||||
.word _magic_start @ absolute load/run zImage address
|
||||
.word _magic_end @ zImage end address
|
||||
.word 0x04030201 @ endianness flag
|
||||
|
||||
THUMB( .thumb )
|
||||
1: __EFI_HEADER
|
||||
|
||||
__EFI_HEADER
|
||||
1:
|
||||
ARM_BE8( setend be ) @ go BE8 if compiled for BE8
|
||||
AR_CLASS( mrs r9, cpsr )
|
||||
#ifdef CONFIG_ARM_VIRT_EXT
|
||||
|
@ -1382,7 +1385,21 @@ ENTRY(efi_stub_entry)
|
|||
|
||||
@ Preserve return value of efi_entry() in r4
|
||||
mov r4, r0
|
||||
bl cache_clean_flush
|
||||
|
||||
@ our cache maintenance code relies on CP15 barrier instructions
|
||||
@ but since we arrived here with the MMU and caches configured
|
||||
@ by UEFI, we must check that the CP15BEN bit is set in SCTLR.
|
||||
@ Note that this bit is RAO/WI on v6 and earlier, so the ISB in
|
||||
@ the enable path will be executed on v7+ only.
|
||||
mrc p15, 0, r1, c1, c0, 0 @ read SCTLR
|
||||
tst r1, #(1 << 5) @ CP15BEN bit set?
|
||||
bne 0f
|
||||
orr r1, r1, #(1 << 5) @ CP15 barrier instructions
|
||||
mcr p15, 0, r1, c1, c0, 0 @ write SCTLR
|
||||
ARM( .inst 0xf57ff06f @ v7+ isb )
|
||||
THUMB( isb )
|
||||
|
||||
0: bl cache_clean_flush
|
||||
bl cache_off
|
||||
|
||||
@ Set parameters for booting zImage according to boot protocol
|
||||
|
|
|
@ -1,10 +1,14 @@
|
|||
#ifndef _ARM_LIBFDT_ENV_H
|
||||
#define _ARM_LIBFDT_ENV_H
|
||||
|
||||
#include <linux/limits.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <asm/byteorder.h>
|
||||
|
||||
#define INT32_MAX S32_MAX
|
||||
#define UINT32_MAX U32_MAX
|
||||
|
||||
typedef __be16 fdt16_t;
|
||||
typedef __be32 fdt32_t;
|
||||
typedef __be64 fdt64_t;
|
||||
|
|
|
@ -701,6 +701,7 @@
|
|||
pinctrl-0 = <&cpsw_default>;
|
||||
pinctrl-1 = <&cpsw_sleep>;
|
||||
status = "okay";
|
||||
slaves = <1>;
|
||||
};
|
||||
|
||||
&davinci_mdio {
|
||||
|
@ -708,15 +709,14 @@
|
|||
pinctrl-0 = <&davinci_mdio_default>;
|
||||
pinctrl-1 = <&davinci_mdio_sleep>;
|
||||
status = "okay";
|
||||
|
||||
ethphy0: ethernet-phy@0 {
|
||||
reg = <0>;
|
||||
};
|
||||
};
|
||||
|
||||
&cpsw_emac0 {
|
||||
phy_id = <&davinci_mdio>, <0>;
|
||||
phy-mode = "rgmii-txid";
|
||||
};
|
||||
|
||||
&cpsw_emac1 {
|
||||
phy_id = <&davinci_mdio>, <1>;
|
||||
phy-handle = <&ethphy0>;
|
||||
phy-mode = "rgmii-txid";
|
||||
};
|
||||
|
||||
|
|
|
@ -74,6 +74,11 @@
|
|||
};
|
||||
};
|
||||
|
||||
/* Table Table 5-79 of the TRM shows 480ab000 is reserved */
|
||||
&usb_otg_hs {
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
&iva {
|
||||
status = "disabled";
|
||||
};
|
||||
|
|
|
@ -1117,6 +1117,8 @@
|
|||
ti,hwmods = "dss_dispc";
|
||||
clocks = <&disp_clk>;
|
||||
clock-names = "fck";
|
||||
|
||||
max-memory-bandwidth = <230000000>;
|
||||
};
|
||||
|
||||
rfbi: rfbi@4832a800 {
|
||||
|
|
|
@ -79,7 +79,7 @@
|
|||
};
|
||||
|
||||
lcd0: display {
|
||||
compatible = "osddisplays,osd057T0559-34ts", "panel-dpi";
|
||||
compatible = "osddisplays,osd070t1718-19ts", "panel-dpi";
|
||||
label = "lcd";
|
||||
|
||||
panel-timing {
|
||||
|
|
|
@ -533,6 +533,8 @@
|
|||
|
||||
touchscreen-size-x = <480>;
|
||||
touchscreen-size-y = <272>;
|
||||
|
||||
wakeup-source;
|
||||
};
|
||||
|
||||
tlv320aic3106: tlv320aic3106@1b {
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
};
|
||||
|
||||
lcd0: display {
|
||||
compatible = "osddisplays,osd057T0559-34ts", "panel-dpi";
|
||||
compatible = "osddisplays,osd070t1718-19ts", "panel-dpi";
|
||||
label = "lcd";
|
||||
|
||||
panel-timing {
|
||||
|
|
|
@ -334,7 +334,7 @@
|
|||
clock-names = "uartclk", "apb_pclk";
|
||||
};
|
||||
|
||||
ssp: ssp@1000d000 {
|
||||
ssp: spi@1000d000 {
|
||||
compatible = "arm,pl022", "arm,primecell";
|
||||
reg = <0x1000d000 0x1000>;
|
||||
clocks = <&sspclk>, <&pclk>;
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue