Merge 4.9.212 branch 'android-4.9-q' into tw10-android-4.9-q

Documentation/filesystems/fscrypt.rst
	arch/arm/common/Kconfig
	arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
	arch/arm64/boot/dts/amd/amd-seattle-soc.dtsi
	arch/arm64/boot/dts/arm/juno-clocks.dtsi
	arch/arm64/boot/dts/broadcom/ns2.dtsi
	arch/arm64/boot/dts/lg/lg1312.dtsi
	arch/arm64/boot/dts/lg/lg1313.dtsi
	arch/arm64/boot/dts/marvell/armada-37xx.dtsi
	arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi
	arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi
	arch/arm64/boot/dts/nvidia/tegra210.dtsi
	arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi
	arch/arm64/boot/dts/qcom/msm8996.dtsi
	arch/arm64/configs/ranchu64_defconfig
	arch/arm64/include/asm/cpucaps.h
	arch/arm64/kernel/cpufeature.c
	arch/arm64/kernel/traps.c
	arch/arm64/mm/mmu.c
	crypto/Makefile
	crypto/ablkcipher.c
	crypto/blkcipher.c
	crypto/testmgr.h
	crypto/zstd.c
	drivers/android/binder.c
	drivers/android/binder_alloc.c
	drivers/char/random.c
	drivers/clocksource/exynos_mct.c
	drivers/dma/pl330.c
	drivers/hid/hid-sony.c
	drivers/hid/uhid.c
	drivers/hid/usbhid/hiddev.c
	drivers/i2c/i2c-core.c
	drivers/md/dm-crypt.c
	drivers/media/v4l2-core/videobuf2-v4l2.c
	drivers/mmc/host/dw_mmc.c
	drivers/net/ethernet/broadcom/tg3.c
	drivers/net/usb/r8152.c
	drivers/scsi/scsi_logging.c
	drivers/scsi/sd.c
	drivers/scsi/ufs/ufshcd-pci.c
	drivers/scsi/ufs/ufshcd-pltfrm.c
	drivers/staging/android/Kconfig
	drivers/staging/android/ion/ion.c
	drivers/staging/android/ion/ion_priv.h
	drivers/staging/android/ion/ion_system_heap.c
	drivers/staging/android/lowmemorykiller.c
	drivers/tty/serial/samsung.c
	drivers/usb/dwc3/core.c
	drivers/usb/dwc3/gadget.c
	drivers/usb/host/xhci-hub.c
	drivers/video/fbdev/core/fbmon.c
	drivers/video/fbdev/core/modedb.c
	fs/crypto/fname.c
	fs/crypto/fscrypt_private.h
	fs/crypto/keyinfo.c
	fs/ext4/ialloc.c
	fs/ext4/namei.c
	fs/ext4/xattr.c
	fs/f2fs/checkpoint.c
	fs/f2fs/data.c
	fs/f2fs/debug.c
	fs/f2fs/dir.c
	fs/f2fs/f2fs.h
	fs/f2fs/file.c
	fs/f2fs/gc.c
	fs/f2fs/inline.c
	fs/f2fs/inode.c
	fs/f2fs/namei.c
	fs/f2fs/node.c
	fs/f2fs/recovery.c
	fs/f2fs/segment.c
	fs/f2fs/segment.h
	fs/f2fs/super.c
	fs/f2fs/sysfs.c
	fs/fat/dir.c
	fs/fat/fatent.c
	fs/file.c
	fs/namespace.c
	fs/pnode.c
	fs/proc/inode.c
	fs/proc/root.c
	fs/proc/task_mmu.c
	fs/sdcardfs/dentry.c
	fs/sdcardfs/derived_perm.c
	fs/sdcardfs/file.c
	fs/sdcardfs/inode.c
	fs/sdcardfs/lookup.c
	fs/sdcardfs/main.c
	fs/sdcardfs/sdcardfs.h
	fs/sdcardfs/super.c
	include/linux/blk_types.h
	include/linux/cpuhotplug.h
	include/linux/cred.h
	include/linux/fb.h
	include/linux/power_supply.h
	include/linux/sched.h
	include/linux/zstd.h
	include/trace/events/sched.h
	include/uapi/linux/android/binder.h
	init/Kconfig
	init/main.c
	kernel/bpf/hashtab.c
	kernel/cpu.c
	kernel/cred.c
	kernel/fork.c
	kernel/locking/spinlock_debug.c
	kernel/panic.c
	kernel/printk/printk.c
	kernel/sched/Makefile
	kernel/sched/core.c
	kernel/sched/fair.c
	kernel/sched/rt.c
	kernel/sched/walt.c
	kernel/sched/walt.h
	kernel/trace/trace.c
	lib/bug.c
	lib/list_debug.c
	lib/vsprintf.c
	lib/zstd/bitstream.h
	lib/zstd/compress.c
	lib/zstd/decompress.c
	lib/zstd/fse.h
	lib/zstd/fse_compress.c
	lib/zstd/fse_decompress.c
	lib/zstd/huf_compress.c
	lib/zstd/huf_decompress.c
	lib/zstd/zstd_internal.h
	mm/debug.c
	mm/filemap.c
	mm/rmap.c
	net/core/filter.c
	net/ipv4/sysctl_net_ipv4.c
	net/ipv4/sysfs_net_ipv4.c
	net/ipv4/tcp_input.c
	net/ipv4/tcp_output.c
	net/ipv4/udp.c
	net/ipv6/netfilter/nf_conntrack_reasm.c
	net/netfilter/Kconfig
	net/netfilter/Makefile
	net/netfilter/xt_qtaguid.c
	net/netfilter/xt_qtaguid_internal.h
	net/xfrm/xfrm_policy.c
	net/xfrm/xfrm_state.c
	scripts/checkpatch.pl
	security/selinux/hooks.c
	sound/core/compress_offload.c
Author: FAROVITUS
Date:   2020-02-09 15:34:27 +02:00
Commit: af1d3ae977

4823 changed files with 128774 additions and 44493 deletions


@ -1,119 +0,0 @@
What: /sys/block/zram<id>/num_reads
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The num_reads file is read-only and specifies the number of
reads (failed or successful) done on this device.
Now accessible via zram<id>/stat node.
What: /sys/block/zram<id>/num_writes
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The num_writes file is read-only and specifies the number of
writes (failed or successful) done on this device.
Now accessible via zram<id>/stat node.
What: /sys/block/zram<id>/invalid_io
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The invalid_io file is read-only and specifies the number of
non-page-size-aligned I/O requests issued to this device.
Now accessible via zram<id>/io_stat node.
What: /sys/block/zram<id>/failed_reads
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The failed_reads file is read-only and specifies the number of
failed reads that have happened on this device.
Now accessible via zram<id>/io_stat node.
What: /sys/block/zram<id>/failed_writes
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The failed_writes file is read-only and specifies the number of
failed writes that have happened on this device.
Now accessible via zram<id>/io_stat node.
What: /sys/block/zram<id>/notify_free
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The notify_free file is read-only. Depending on device usage
scenario it may account a) the number of pages freed because
of swap slot free notifications or b) the number of pages freed
because of REQ_DISCARD requests sent by bio. The former ones
are sent to a swap block device when a swap slot is freed, which
implies that this disk is being used as a swap disk. The latter
ones are sent by filesystem mounted with discard option,
whenever some data blocks are getting discarded.
Now accessible via zram<id>/io_stat node.
What: /sys/block/zram<id>/zero_pages
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The zero_pages file is read-only and specifies number of zero
filled pages written to this disk. No memory is allocated for
such pages.
Now accessible via zram<id>/mm_stat node.
What: /sys/block/zram<id>/orig_data_size
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The orig_data_size file is read-only and specifies uncompressed
size of data stored in this disk. This excludes zero-filled
pages (zero_pages) since no memory is allocated for them.
Unit: bytes
Now accessible via zram<id>/mm_stat node.
What: /sys/block/zram<id>/compr_data_size
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The compr_data_size file is read-only and specifies compressed
size of data stored in this disk. So, compression ratio can be
calculated using orig_data_size and this statistic.
Unit: bytes
Now accessible via zram<id>/mm_stat node.
What: /sys/block/zram<id>/mem_used_total
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The mem_used_total file is read-only and specifies the amount
of memory, including allocator fragmentation and metadata
overhead, allocated for this disk. So, allocator space
efficiency can be calculated using compr_data_size and this
statistic.
Unit: bytes
Now accessible via zram<id>/mm_stat node.
What: /sys/block/zram<id>/mem_used_max
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The mem_used_max file is read/write and specifies the maximum
amount of memory zram has consumed to store compressed data.
To reset the value, write "0"; otherwise, you will see -EINVAL.
Unit: bytes
Downgraded to write-only node: so it's possible to set new
value only; its current value is stored in zram<id>/mm_stat
node.
What: /sys/block/zram<id>/mem_limit
Date: August 2015
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The mem_limit file is read/write and specifies the maximum
amount of memory ZRAM can use to store the compressed data.
The limit could be changed in run time and "0" means disable
the limit. No limit is the initial state. Unit: bytes
Downgraded to write-only node: so it's possible to set new
value only; its current value is stored in zram<id>/mm_stat
node.


@ -22,41 +22,6 @@ Description:
device. The reset operation frees all the memory associated
with this device.
What: /sys/block/zram<id>/num_reads
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The num_reads file is read-only and specifies the number of
reads (failed or successful) done on this device.
What: /sys/block/zram<id>/num_writes
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The num_writes file is read-only and specifies the number of
writes (failed or successful) done on this device.
What: /sys/block/zram<id>/invalid_io
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The invalid_io file is read-only and specifies the number of
non-page-size-aligned I/O requests issued to this device.
What: /sys/block/zram<id>/failed_reads
Date: February 2014
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The failed_reads file is read-only and specifies the number of
failed reads that have happened on this device.
What: /sys/block/zram<id>/failed_writes
Date: February 2014
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The failed_writes file is read-only and specifies the number of
failed writes that have happened on this device.
What: /sys/block/zram<id>/max_comp_streams
Date: February 2014
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
@ -73,74 +38,24 @@ Description:
available and selected compression algorithms, change
compression algorithm selection.
What: /sys/block/zram<id>/notify_free
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The notify_free file is read-only. Depending on device usage
scenario it may account a) the number of pages freed because
of swap slot free notifications or b) the number of pages freed
because of REQ_DISCARD requests sent by bio. The former ones
are sent to a swap block device when a swap slot is freed, which
implies that this disk is being used as a swap disk. The latter
ones are sent by filesystem mounted with discard option,
whenever some data blocks are getting discarded.
What: /sys/block/zram<id>/zero_pages
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The zero_pages file is read-only and specifies number of zero
filled pages written to this disk. No memory is allocated for
such pages.
What: /sys/block/zram<id>/orig_data_size
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The orig_data_size file is read-only and specifies uncompressed
size of data stored in this disk. This excludes zero-filled
pages (zero_pages) since no memory is allocated for them.
Unit: bytes
What: /sys/block/zram<id>/compr_data_size
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The compr_data_size file is read-only and specifies compressed
size of data stored in this disk. So, compression ratio can be
calculated using orig_data_size and this statistic.
Unit: bytes
What: /sys/block/zram<id>/mem_used_total
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
The mem_used_total file is read-only and specifies the amount
of memory, including allocator fragmentation and metadata
overhead, allocated for this disk. So, allocator space
efficiency can be calculated using compr_data_size and this
statistic.
Unit: bytes
What: /sys/block/zram<id>/mem_used_max
Date: August 2014
Contact: Minchan Kim <minchan@kernel.org>
Description:
The mem_used_max file is read/write and specifies the maximum
amount of memory zram has consumed to store compressed data.
To reset the value, write "0"; otherwise, you will see -EINVAL.
The mem_used_max file is write-only and is used to reset
the counter of the maximum memory zram has consumed to store
compressed data. To reset the value, write "0"; otherwise,
you will see -EINVAL.
Unit: bytes
What: /sys/block/zram<id>/mem_limit
Date: August 2014
Contact: Minchan Kim <minchan@kernel.org>
Description:
The mem_limit file is read/write and specifies the maximum
amount of memory ZRAM can use to store the compressed data. The
limit could be changed in run time and "0" means disable the
limit. No limit is the initial state. Unit: bytes
The mem_limit file is write-only and specifies the maximum
amount of memory ZRAM can use to store the compressed data.
The limit could be changed in run time and "0" means disable
the limit. No limit is the initial state. Unit: bytes
What: /sys/block/zram<id>/compact
Date: August 2015
@ -175,3 +90,50 @@ Description:
device's debugging info useful for kernel developers. Its
format is not documented intentionally and may change
anytime without any notice.
What: /sys/block/zram<id>/backing_dev
Date: June 2017
Contact: Minchan Kim <minchan@kernel.org>
Description:
The backing_dev file is read-write and sets up the backing
device for zram to write incompressible pages to.
To use it, the user should enable CONFIG_ZRAM_WRITEBACK.
What: /sys/block/zram<id>/idle
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The idle file is write-only and marks zram slots as idle.
If the system has debugfs mounted, the user can see which slots
are idle via /sys/kernel/debug/zram/zram<id>/block_state
What: /sys/block/zram<id>/writeback
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The writeback file is write-only and triggers idle and/or
huge page writeback to the backing device.
What: /sys/block/zram<id>/bd_stat
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The bd_stat file is read-only and represents backing device's
statistics (bd_count, bd_reads, bd_writes) in a format
similar to block layer statistics file format.
What: /sys/block/zram<id>/writeback_limit_enable
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The writeback_limit_enable file is read-write and specifies
enabling of the writeback_limit feature. "1" means enable the
feature. "0" (no limit) is the initial state.
What: /sys/block/zram<id>/writeback_limit
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The writeback_limit file is read-write and specifies the maximum
amount of writeback ZRAM can do. The limit could be changed
in run time.


@ -4,7 +4,7 @@ KernelVersion: 3.10
Contact: Samuel Ortiz <sameo@linux.intel.com>
linux-mei@linux.intel.com
Description: Stores the same MODALIAS value emitted by uevent
Format: mei:<mei device name>:<device uuid>:
Format: mei:<mei device name>:<device uuid>:<protocol version>
What: /sys/bus/mei/devices/.../name
Date: May 2015


@ -356,6 +356,10 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/spectre_v1
/sys/devices/system/cpu/vulnerabilities/spectre_v2
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/l1tf
/sys/devices/system/cpu/vulnerabilities/mds
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
@ -367,3 +371,25 @@ Description: Information about CPU vulnerabilities
"Not affected" CPU is not affected by the vulnerability
"Vulnerable" CPU is affected and no mitigation in effect
"Mitigation: $M" CPU is affected and mitigation $M is in effect
See also: Documentation/hw-vuln/index.rst
What: /sys/devices/system/cpu/smt
/sys/devices/system/cpu/smt/active
/sys/devices/system/cpu/smt/control
Date: June 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Control Symmetric Multi Threading (SMT)
active: Tells whether SMT is active (enabled and siblings online)
control: Read/write interface to control SMT. Possible
values:
"on" SMT is enabled
"off" SMT is disabled
"forceoff" SMT is force disabled. Cannot be changed.
"notsupported" SMT is not supported by the CPU
If control status is "forceoff" or "notsupported" writes
are rejected.


@ -51,6 +51,14 @@ Description:
Controls the dirty page count condition for the in-place-update
policies.
What: /sys/fs/f2fs/<disk>/min_seq_blocks
Date: August 2018
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description:
Controls the dirty page count condition for batched sequential
writes in ->writepages.
What: /sys/fs/f2fs/<disk>/min_hot_blocks
Date: March 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
@ -78,12 +86,28 @@ Description:
The unit size is one block, now only support configuring in range
of [1, 512].
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
Date: January 2019
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description:
Set timeout to issue discard commands during umount.
Default: 5 secs
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description:
Controls the number of trials to find a victim segment.
What: /sys/fs/f2fs/<disk>/migration_granularity
Date: October 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
Description:
Controls the migration granularity of garbage collection on a large
section. It lets GC move partial segment(s) of one section
in one GC cycle, dispersing the heavy GC overhead across
multiple lightweight GC cycles.
What: /sys/fs/f2fs/<disk>/dir_level
Date: March 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@ -113,7 +137,22 @@ What: /sys/fs/f2fs/<disk>/idle_interval
Date: January 2016
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description:
Controls the idle timing.
Controls the idle timing for all paths other than
discard and gc path.
What: /sys/fs/f2fs/<disk>/discard_idle_interval
Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
Description:
Controls the idle timing for discard path.
What: /sys/fs/f2fs/<disk>/gc_idle_interval
Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
Description:
Controls the idle timing for gc path.
What: /sys/fs/f2fs/<disk>/iostat_enable
Date: August 2017


@ -33,7 +33,7 @@ GNU C 3.2 gcc --version
GNU make 3.80 make --version
binutils 2.12 ld -v
util-linux 2.10o fdformat --version
module-init-tools 0.9.10 depmod -V
kmod 13 depmod -V
e2fsprogs 1.41.4 e2fsck -V
jfsutils 1.1.3 fsck.jfs -V
reiserfsprogs 3.6.3 reiserfsck -V
@ -143,12 +143,6 @@ is not build with ``CONFIG_KALLSYMS`` and you have no way to rebuild and
reproduce the Oops with that option, then you can still decode that Oops
with ksymoops.
Module-Init-Tools
-----------------
A new module loader is now in the kernel that requires ``module-init-tools``
to use. It is backward compatible with the 2.4.x series kernels.
Mkinitrd
--------
@ -363,16 +357,17 @@ Util-linux
- <ftp://ftp.kernel.org/pub/linux/utils/util-linux/>
Kmod
----
- <https://www.kernel.org/pub/linux/utils/kernel/kmod/>
- <https://git.kernel.org/pub/scm/utils/kernel/kmod/kmod.git>
Ksymoops
--------
- <ftp://ftp.kernel.org/pub/linux/utils/kernel/ksymoops/v2.4/>
Module-Init-Tools
-----------------
- <ftp://ftp.kernel.org/pub/linux/kernel/people/rusty/modules/>
Mkinitrd
--------


@ -0,0 +1,180 @@
================================
PSI - Pressure Stall Information
================================
:Date: April, 2018
:Author: Johannes Weiner <hannes@cmpxchg.org>
When CPU, memory or IO devices are contended, workloads experience
latency spikes, throughput losses, and run the risk of OOM kills.
Without an accurate measure of such contention, users are forced to
either play it safe and under-utilize their hardware resources, or
roll the dice and frequently suffer the disruptions resulting from
excessive overcommit.
The psi feature identifies and quantifies the disruptions caused by
such resource crunches and the time impact it has on complex workloads
or even entire systems.
Having an accurate measure of productivity losses caused by resource
scarcity aids users in sizing workloads to hardware--or provisioning
hardware according to workload demand.
As psi aggregates this information in realtime, systems can be managed
dynamically using techniques such as load shedding, migrating jobs to
other systems or data centers, or strategically pausing or killing low
priority or restartable batch jobs.
This allows maximizing hardware utilization without sacrificing
workload health or risking major disruptions such as OOM kills.
Pressure interface
==================
Pressure information for each resource is exported through the
respective file in /proc/pressure/ -- cpu, memory, and io.
The format for CPU is as such:
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
and for memory and IO:
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
The "some" line indicates the share of time in which at least some
tasks are stalled on a given resource.
The "full" line indicates the share of time in which all non-idle
tasks are stalled on a given resource simultaneously. In this state
actual CPU cycles are going to waste, and a workload that spends
extended time in this state is considered to be thrashing. This has
severe impact on performance, and it's useful to distinguish this
situation from a state where some tasks are stalled but the CPU is
still doing productive work. As such, time spent in this subset of the
stall state is tracked separately and exported in the "full" averages.
The ratios are tracked as recent trends over ten, sixty, and three
hundred second windows, which gives insight into short term events as
well as medium and long term trends. The total absolute stall time is
tracked and exported as well, to allow detection of latency spikes
which wouldn't necessarily make a dent in the time averages, or to
average trends over custom time frames.
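
As a quick illustration of consuming this interface (not part of the original
document), the fields can be read with ordinary file I/O. The sketch below
assumes the "some" line format shown above and omits most error handling.

#include <stdio.h>

int main(void)
{
	float avg10, avg60, avg300;
	unsigned long long total;
	FILE *f = fopen("/proc/pressure/memory", "r");

	if (!f)
		return 1;
	/* Parse the "some" line: some avg10=... avg60=... avg300=... total=... */
	if (fscanf(f, "some avg10=%f avg60=%f avg300=%f total=%llu",
		   &avg10, &avg60, &avg300, &total) == 4)
		printf("memory some: avg10=%.2f total=%llu\n", avg10, total);
	fclose(f);
	return 0;
}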
Monitoring for pressure thresholds
==================================
Users can register triggers and use poll() to be woken up when resource
pressure exceeds certain thresholds.
A trigger describes the maximum cumulative stall time over a specific
time window, e.g. 100ms of total stall time within any 500ms window to
generate a wakeup event.
To register a trigger user has to open psi interface file under
/proc/pressure/ representing the resource to be monitored and write the
desired threshold and time window. The open file descriptor should be
used to wait for trigger events using select(), poll() or epoll().
The following format is used:
<some|full> <stall amount in us> <time window in us>
For example writing "some 150000 1000000" into /proc/pressure/memory
would add 150ms threshold for partial memory stall measured within
1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
would add 50ms threshold for full io stall measured within 1sec time window.
Triggers can be set on more than one psi metric and more than one trigger
for the same psi metric can be specified. However for each trigger a separate
file descriptor is required to be able to poll it separately from others,
therefore for each trigger a separate open() syscall should be made even
when opening the same psi interface file.
Monitors activate only when system enters stall state for the monitored
psi metric and deactivates upon exit from the stall state. While system is
in the stall state psi signal growth is monitored at a rate of 10 times per
tracking window.
The kernel accepts window sizes ranging from 500ms to 10s, therefore min
monitoring update interval is 50ms and max is 1s. Min limit is set to
prevent overly frequent polling. Max limit is chosen as a high enough number
after which monitors are most likely not needed and psi averages can be used
instead.
When activated, psi monitor stays active for at least the duration of one
tracking window to avoid repeated activations/deactivations when system is
bouncing in and out of the stall state.
Notifications to the userspace are rate-limited to one per tracking window.
The trigger will de-register when the file descriptor used to define the
trigger is closed.
Userspace monitor usage example
===============================
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

/*
 * Monitor memory partial stall with 1s tracking window size
 * and 150ms threshold.
 */
int main() {
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;
	int n;

	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fds.fd < 0) {
		printf("/proc/pressure/memory open error: %s\n",
			strerror(errno));
		return 1;
	}
	fds.events = POLLPRI;

	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
		printf("/proc/pressure/memory write error: %s\n",
			strerror(errno));
		return 1;
	}

	printf("waiting for events...\n");
	while (1) {
		n = poll(&fds, 1, -1);
		if (n < 0) {
			printf("poll error: %s\n", strerror(errno));
			return 1;
		}
		if (fds.revents & POLLERR) {
			printf("got POLLERR, event source is gone\n");
			return 0;
		}
		if (fds.revents & POLLPRI) {
			printf("event triggered!\n");
		} else {
			printf("unknown event received: 0x%x\n", fds.revents);
			return 1;
		}
	}

	return 0;
}
Cgroup2 interface
=================
In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem
mounted, pressure stall information is also tracked for tasks grouped
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
cpu.pressure, memory.pressure, and io.pressure files; the format is
the same as the /proc/pressure/ files.
Per-cgroup psi monitors can be specified and used the same way as
system-wide ones.


@ -6,7 +6,7 @@ TL;DR summary
* Use only NEON instructions, or VFP instructions that don't rely on support
code
* Isolate your NEON code in a separate compilation unit, and compile it with
'-mfpu=neon -mfloat-abi=softfp'
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
NEON code
* Don't sleep in your NEON code, and be aware that it will be executed with
@ -87,7 +87,7 @@ instructions appearing in unexpected places if no special care is taken.
Therefore, the recommended and only supported way of using NEON/VFP in the
kernel is by adhering to the following rules:
* isolate the NEON code in a separate compilation unit and compile it with
'-mfpu=neon -mfloat-abi=softfp';
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
into the unit containing the NEON code from a compilation unit which is *not*
built with the GCC flag '-mfpu=neon' set.


@ -156,47 +156,24 @@ Per-device statistics are exported as various nodes under /sys/block/zram<id>/
A brief description of exported device attributes. For more details please
read Documentation/ABI/testing/sysfs-block-zram.
Name access description
---- ------ -----------
disksize RW show and set the device's disk size
initstate RO shows the initialization state of the device
reset WO trigger device reset
num_reads RO the number of reads
failed_reads RO the number of failed reads
num_write RO the number of writes
failed_writes RO the number of failed writes
invalid_io RO the number of non-page-size-aligned I/O requests
max_comp_streams RW the number of possible concurrent compress operations
comp_algorithm RW show and change the compression algorithm
notify_free RO the number of notifications to free pages (either
slot free notifications or REQ_DISCARD requests)
zero_pages RO the number of zero filled pages written to this disk
orig_data_size RO uncompressed size of data stored in this disk
compr_data_size RO compressed size of data stored in this disk
mem_used_total RO the amount of memory allocated for this disk
mem_used_max RW the maximum amount of memory zram have consumed to
store the data (to reset this counter to the actual
current value, write 1 to this attribute)
mem_limit RW the maximum amount of memory ZRAM can use to store
the compressed data
pages_compacted RO the number of pages freed during compaction
(available only via zram<id>/mm_stat node)
compact WO trigger memory compaction
debug_stat RO this file is used for zram debugging purposes
Name access description
---- ------ -----------
disksize RW show and set the device's disk size
initstate RO shows the initialization state of the device
reset WO trigger device reset
mem_used_max WO reset the `mem_used_max' counter (see later)
mem_limit WO specifies the maximum amount of memory ZRAM can use
to store the compressed data
writeback_limit WO specifies the maximum amount of write IO zram can
write out to backing device as 4KB unit
writeback_limit_enable RW show and set writeback_limit feature
max_comp_streams RW the number of possible concurrent compress operations
comp_algorithm RW show and change the compression algorithm
compact WO trigger memory compaction
debug_stat RO this file is used for zram debugging purposes
backing_dev RW set up backend storage for zram to write out
idle WO mark allocated slot as idle
WARNING
=======
per-stat sysfs attributes are considered to be deprecated.
The basic strategy is:
-- the existing RW nodes will be downgraded to WO nodes (in linux 4.11)
-- deprecated RO sysfs nodes will eventually be removed (in linux 4.11)
The list of deprecated attributes can be found here:
Documentation/ABI/obsolete/sysfs-block-zram
Basically, every attribute that has its own read accessible sysfs node
(e.g. num_reads) *AND* is accessible via one of the stat files (zram<id>/stat
or zram<id>/io_stat or zram<id>/mm_stat) is considered to be deprecated.
User space is advised to use the following files to read the device statistics.
@ -211,22 +188,52 @@ The stat file represents device's I/O statistics not accounted by block
layer and, thus, not available in zram<id>/stat file. It consists of a
single line of text and contains the following stats separated by
whitespace:
failed_reads
failed_writes
invalid_io
notify_free
failed_reads the number of failed reads
failed_writes the number of failed writes
invalid_io the number of non-page-size-aligned I/O requests
notify_free Depending on device usage scenario it may account
a) the number of pages freed because of swap slot free
notifications or b) the number of pages freed because of
REQ_DISCARD requests sent by bio. The former ones are
sent to a swap block device when a swap slot is freed,
which implies that this disk is being used as a swap disk.
The latter ones are sent by filesystem mounted with
discard option, whenever some data blocks are getting
discarded.
File /sys/block/zram<id>/mm_stat
The stat file represents device's mm statistics. It consists of a single
line of text and contains the following stats separated by whitespace:
orig_data_size
compr_data_size
mem_used_total
mem_limit
mem_used_max
zero_pages
num_migrated
orig_data_size uncompressed size of data stored in this disk.
This excludes same-element-filled pages (same_pages) since
no memory is allocated for them.
Unit: bytes
compr_data_size compressed size of data stored in this disk
mem_used_total the amount of memory allocated for this disk, including
allocator fragmentation and metadata overhead. So, allocator
space efficiency can be calculated using compr_data_size and
this statistic.
Unit: bytes
mem_limit the maximum amount of memory ZRAM can use to store
the compressed data
mem_used_max the maximum amount of memory zram has consumed to
store the data
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
pages_compacted the number of pages freed during compaction
huge_pages the number of incompressible pages
File /sys/block/zram<id>/bd_stat
The stat file represents device's backing device statistics. It consists of
a single line of text and contains the following stats separated by whitespace:
bd_count size of data written in backing device.
Unit: 4K bytes
bd_reads the number of reads from backing device
Unit: 4K bytes
bd_writes the number of writes to backing device
Unit: 4K bytes
9) Deactivate:
swapoff /dev/zram0
@ -241,5 +248,108 @@ line of text and contains the following stats separated by whitespace:
resets the disksize to zero. You must set the disksize again
before reusing the device.
* Optional Feature
= writeback
With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible pages
to backing storage rather than keeping them in memory.
To use the feature, the admin should set up the backing device via
"echo /dev/sda5 > /sys/block/zramX/backing_dev"
before setting the disksize. Only a partition is supported at the moment.
If the admin wants to use incompressible page writeback, it can be done via
"echo huge > /sys/block/zramX/writeback"
To use idle page writeback, the user first needs to declare zram pages
as idle.
"echo all > /sys/block/zramX/idle"
From now on, all pages on zram are idle pages. The idle mark
is removed when someone requests access to a block.
IOW, unless there is an access request, those pages remain idle pages.
The admin can request writeback of those idle pages at the right time via
"echo idle > /sys/block/zramX/writeback"
With this command, zram writes back idle pages from memory to the storage.
If there is a lot of write IO to a flash device, it can potentially cause
flash wearout, so the admin needs to design a write limitation
to guarantee storage health for the entire product life.
To address this concern, zram supports the "writeback_limit" feature.
The default value of "writeback_limit_enable" is 0, so it doesn't limit
any writeback. IOW, if the admin wants to apply a writeback budget, they
should enable writeback_limit_enable via
$ echo 1 > /sys/block/zramX/writeback_limit_enable
Once writeback_limit_enable is set, zram doesn't allow any writeback
until the admin sets the budget via /sys/block/zramX/writeback_limit.
(If the admin doesn't enable writeback_limit_enable, the writeback_limit
value assigned via /sys/block/zramX/writeback_limit is meaningless.)
If the admin wants to limit writeback to 400M per day, it could be done
like below.
$ MB_SHIFT=20
$ FOURK_SHIFT=12
$ echo $((400<<MB_SHIFT>>FOURK_SHIFT)) > \
	/sys/block/zram0/writeback_limit
$ echo 1 > /sys/block/zram0/writeback_limit_enable
If the admin wants to allow further writeback again once the budget is
exhausted, it could be done like below:
$ echo $((400<<MB_SHIFT>>FOURK_SHIFT)) > \
	/sys/block/zram0/writeback_limit
To see the remaining writeback budget since it was set:
$ cat /sys/block/zramX/writeback_limit
To disable the writeback limit:
$ echo 0 > /sys/block/zramX/writeback_limit_enable
The writeback_limit count will reset whenever you reset zram (e.g.,
system reboot, echo 1 > /sys/block/zramX/reset), so keeping track of how
much writeback happened before the reset, in order to allocate an extra
writeback budget in the next setting, is the user's job.
To measure the writeback count over a certain period, read the 3rd column
of /sys/block/zram0/bd_stat.
= memory tracking
With CONFIG_ZRAM_MEMORY_TRACKING, the user can inspect the state of
zram blocks. It can be useful for catching cold or incompressible
pages of a process with pagemap.
If you enable the feature, you can see the block state via
/sys/kernel/debug/zram/zram0/block_state. The output is as follows:
300 75.033841 .wh.
301 63.806904 s...
302 63.806919 ..hi
First column is zram's block index.
Second column is access time since the system was booted
Third column is state of the block.
(s: same page
w: written page to backing store
h: huge page
i: idle page)
The first line of the above example says the 300th block was accessed at
75.033841 sec and the block's state is huge, so it was written back to the
backing storage. It's a debugging feature, so no one should rely on it to
work properly.
Nitin Gupta
ngupta@vflare.org


@ -717,6 +717,12 @@ All time durations are in microseconds.
$PERIOD duration. If only one number is written, $MAX is
updated.
cpu.pressure
A read-only nested-key file which exists on non-root cgroups.
Shows pressure stall information for CPU. See
Documentation/accounting/psi.txt for details.
5-2. Memory
@ -925,6 +931,12 @@ PAGE_SIZE multiple when read back.
Swap usage hard limit. If a cgroup's swap usage reaches this
limit, anonymous memory of the cgroup will not be swapped out.
memory.pressure
A read-only nested-key file which exists on non-root cgroups.
Shows pressure stall information for memory. See
Documentation/accounting/psi.txt for details.
5-2-2. Usage Guidelines
@ -1055,6 +1067,12 @@ blk-mq devices.
8:16 rbps=2097152 wbps=max riops=max wiops=max
io.pressure
A read-only nested-key file which exists on non-root cgroups.
Shows pressure stall information for IO. See
Documentation/accounting/psi.txt for details.
5-3-2. Writeback


@ -37,7 +37,7 @@ from load_config import loadConfig
extensions = ['kernel-doc', 'rstFlatTable', 'kernel_include', 'cdomain']
# The name of the math extension changed on Sphinx 1.4
if major == 1 and minor > 3:
if (major == 1 and minor > 3) or (major > 1):
extensions.append("sphinx.ext.imgmath")
else:
extensions.append("sphinx.ext.pngmath")


@ -0,0 +1,99 @@
dm_bow (backup on write)
========================
dm_bow is a device mapper driver that uses the free space on a device to back up
data that is overwritten. The changes can then be committed by a simple state
change, or rolled back by removing the dm_bow device and running a command line
utility over the underlying device.
dm_bow has three states, set by writing 1 or 2 to /sys/block/dm-?/bow/state.
It is only possible to go from state 0 (initial state) to state 1, and then from
state 1 to state 2.
State 0: dm_bow collects all trims to the device and assumes that these mark
free space on the overlying file system that can be safely used. Typically the
mount code would create the dm_bow device, mount the file system, call the
FITRIM ioctl on the file system then switch to state 1. These trims are not
propagated to the underlying device.
State 1: All writes to the device cause the underlying data to be backed up to
the free (trimmed) area as needed in such a way as they can be restored.
However, the writes, with one exception, then happen exactly as they would
without dm_bow, so the device is always in a good final state. The exception is
that sector 0 is used to keep a log of the latest changes, both to indicate that
we are in this state and to allow rollback. See below for all details. If there
isn't enough free space, writes are failed with -ENOSPC.
State 2: The transition to state 2 triggers replacing the special sector 0 with
the normal sector 0, and the freeing of all state information. dm_bow then
becomes a pass-through driver, allowing the device to continue to be used with
minimal performance impact.
Usage
=====
dm-bow takes one command line parameter, the name of the underlying device.
dm-bow will typically be used in the following way. dm-bow will be loaded with a
suitable underlying device and the resultant device will be mounted. A file
system trim will be issued via the FITRIM ioctl, then the device will be
switched to state 1. The file system will now be used as normal. At some point,
the changes can either be committed by switching to state 2, or rolled back by
unmounting the file system, removing the dm-bow device and running the command
line utility. Note that rebooting the device will be equivalent to unmounting
and removing, but the command line utility must still be run.
Details of operation in state 1
===============================
dm_bow maintains a type for all sectors. A sector can be any of:
SECTOR0
SECTOR0_CURRENT
UNCHANGED
FREE
CHANGED
BACKUP
SECTOR0 is the first sector on the device, and is used to hold the log of
changes. This is the one exception.
SECTOR0_CURRENT is a sector picked from the FREE sectors, and is where reads and
writes from the true sector zero are redirected to. Note that like any backup
sector, if the sector is written to directly, it must be moved again.
UNCHANGED means that the sector has not been changed since we entered state 1.
Thus if it is written to or trimmed, the contents must first be backed up.
FREE means that the sector was trimmed in state 0 and has not yet been written
to or used for backup. On being written to, a FREE sector is changed to CHANGED.
CHANGED means that the sector has been modified, and can be further modified
without further backup.
BACKUP means that this is a free sector being used as a backup. On being written
to, the contents must first be backed up again.
All backup operations are logged to the first sector. The log sector has the
format:
--------------------------------------------------------
| Magic | Count | Sequence | Log entry | Log entry | …
--------------------------------------------------------
Magic is a magic number. Count is the number of log entries. Sequence is 0
initially. A log entry is
-----------------------------------
| Source | Dest | Size | Checksum |
-----------------------------------
When SECTOR0 is full, the log sector is backed up and another empty log sector
created with a sequence number one higher. The first entry in any log sector with
sequence > 0 therefore must be the log of the backing up of the previous log
sector. Note that sequence is not strictly needed, but is a useful sanity check
and potentially limits the time spent trying to restore a corrupted snapshot.
On entering state 1, dm_bow has a list of free sectors. All other sectors are
unchanged. Sector0_current is selected from the free sectors and the contents of
sector 0 are copied there. The sector 0 is backed up, which triggers the first
log entry to be written.


@ -6,7 +6,8 @@ Required properties:
"atmel,24c00", "atmel,24c01", "atmel,24c02", "atmel,24c04",
"atmel,24c08", "atmel,24c16", "atmel,24c32", "atmel,24c64",
"atmel,24c128", "atmel,24c256", "atmel,24c512", "atmel,24c1024"
"atmel,24c128", "atmel,24c256", "atmel,24c512", "atmel,24c1024",
"atmel,24c2048"
"catalyst,24c32"
@ -17,7 +18,7 @@ Required properties:
If there is no specific driver for <manufacturer>, a generic
driver based on <type> is selected. Possible types are:
"24c00", "24c01", "24c02", "24c04", "24c08", "24c16", "24c32", "24c64",
"24c128", "24c256", "24c512", "24c1024", "spd"
"24c128", "24c256", "24c512", "24c1024", "24c2048", "spd"
- reg : the I2C address of the EEPROM


@ -4,6 +4,7 @@ Required properties:
- compatible: Should be one of the following:
- "microchip,mcp2510" for MCP2510.
- "microchip,mcp2515" for MCP2515.
- "microchip,mcp25625" for MCP25625.
- reg: SPI chip select.
- clocks: The clock feeding the CAN controller.
- interrupt-parent: The parent interrupt controller.


@ -10,6 +10,7 @@ Required properties:
Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on
the Cadence GEM, or the generic form: "cdns,gem".
Use "atmel,sama5d2-gem" for the GEM IP (10/100) available on Atmel sama5d2 SoCs.
Use "atmel,sama5d3-macb" for the 10/100Mbit IP available on Atmel sama5d3 SoCs.
Use "atmel,sama5d3-gem" for the Gigabit IP available on Atmel sama5d3 SoCs.
Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.


@ -27,4 +27,4 @@ and valid to enable charging:
- "abracon,tc-diode": should be "standard" (0.6V) or "schottky" (0.3V)
- "abracon,tc-resistor": should be <0>, <3>, <6> or <11>. 0 disables the output
resistor, the other values are in ohm.
resistor, the other values are in kOhm.


@ -8,6 +8,6 @@ Required properties:
Example:
serial@12000 {
compatible = "marvell,armada-3700-uart";
reg = <0x12000 0x400>;
reg = <0x12000 0x200>;
interrupts = <43>;
};


@ -125,6 +125,9 @@ active_logs=%u Support configuring the number of active logs. In the
disable_ext_identify Disable the extension list configured by mkfs, so f2fs
is not aware of cold files such as media files.
inline_xattr Enable the inline xattrs feature.
noinline_xattr Disable the inline xattrs feature.
inline_xattr_size=%u Support configuring inline xattr size, it depends on
flexible inline xattr feature.
inline_data Enable the inline data feature: New created small(<~3.4k)
files can be written into inode block.
inline_dentry Enable the inline dir feature: data in new created
@ -154,6 +157,27 @@ noinline_data Disable the inline data feature, inline data feature is
enabled by default.
data_flush Enable data flushing before checkpoint in order to
persist data of regular and symlink.
fault_injection=%d Enable fault injection in all supported types with
specified injection rate.
fault_type=%d Support configuring fault injection type, should be
enabled with fault_injection option, fault type value
is shown below, it supports single or combined type.
Type_Name Type_Value
FAULT_KMALLOC 0x000000001
FAULT_KVMALLOC 0x000000002
FAULT_PAGE_ALLOC 0x000000004
FAULT_PAGE_GET 0x000000008
FAULT_ALLOC_BIO 0x000000010
FAULT_ALLOC_NID 0x000000020
FAULT_ORPHAN 0x000000040
FAULT_BLOCK 0x000000080
FAULT_DIR_DEPTH 0x000000100
FAULT_EVICT_INODE 0x000000200
FAULT_TRUNCATE 0x000000400
FAULT_READ_IO 0x000000800
FAULT_CHECKPOINT 0x000001000
FAULT_DISCARD 0x000002000
FAULT_WRITE_IO 0x000004000
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
writes towards main area.
@ -190,6 +214,11 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix",
non-atomic files likewise "nobarrier" mount option.
test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt
context. The fake fscrypt context is used by xfstests.
checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable"
to reenable checkpointing. Is enabled by default. While
disabled, any unmounting or unexpected shutdowns will cause
the filesystem contents to appear as they did when the
filesystem was mounted with that option.
================================================================================
DEBUGFS ENTRIES


@ -0,0 +1,641 @@
=====================================
Filesystem-level encryption (fscrypt)
=====================================
Introduction
============
fscrypt is a library which filesystems can hook into to support
transparent encryption of files and directories.
Note: "fscrypt" in this document refers to the kernel-level portion,
implemented in ``fs/crypto/``, as opposed to the userspace tool
`fscrypt <https://github.com/google/fscrypt>`_. This document only
covers the kernel-level portion. For command-line examples of how to
use encryption, see the documentation for the userspace tool `fscrypt
<https://github.com/google/fscrypt>`_. Also, it is recommended to use
the fscrypt userspace tool, or other existing userspace tools such as
`fscryptctl <https://github.com/google/fscryptctl>`_ or `Android's key
management system
<https://source.android.com/security/encryption/file-based>`_, over
using the kernel's API directly. Using existing tools reduces the
chance of introducing your own security bugs. (Nevertheless, for
completeness this documentation covers the kernel's API anyway.)
Unlike dm-crypt, fscrypt operates at the filesystem level rather than
at the block device level. This allows it to encrypt different files
with different keys and to have unencrypted files on the same
filesystem. This is useful for multi-user systems where each user's
data-at-rest needs to be cryptographically isolated from the others.
However, except for filenames, fscrypt does not encrypt filesystem
metadata.
Unlike eCryptfs, which is a stacked filesystem, fscrypt is integrated
directly into supported filesystems --- currently ext4, F2FS, and
UBIFS. This allows encrypted files to be read and written without
caching both the decrypted and encrypted pages in the pagecache,
thereby nearly halving the memory used and bringing it in line with
unencrypted files. Similarly, half as many dentries and inodes are
needed. eCryptfs also limits encrypted filenames to 143 bytes,
causing application compatibility issues; fscrypt allows the full 255
bytes (NAME_MAX). Finally, unlike eCryptfs, the fscrypt API can be
used by unprivileged users, with no need to mount anything.
fscrypt does not support encrypting files in-place. Instead, it
supports marking an empty directory as encrypted. Then, after
userspace provides the key, all regular files, directories, and
symbolic links created in that directory tree are transparently
encrypted.
Threat model
============
Offline attacks
---------------
Provided that userspace chooses a strong encryption key, fscrypt
protects the confidentiality of file contents and filenames in the
event of a single point-in-time permanent offline compromise of the
block device content. fscrypt does not protect the confidentiality of
non-filename metadata, e.g. file sizes, file permissions, file
timestamps, and extended attributes. Also, the existence and location
of holes (unallocated blocks which logically contain all zeroes) in
files is not protected.
fscrypt is not guaranteed to protect confidentiality or authenticity
if an attacker is able to manipulate the filesystem offline prior to
an authorized user later accessing the filesystem.
Online attacks
--------------
fscrypt (and storage encryption in general) can only provide limited
protection, if any at all, against online attacks. In detail:
fscrypt is only resistant to side-channel attacks, such as timing or
electromagnetic attacks, to the extent that the underlying Linux
Cryptographic API algorithms are. If a vulnerable algorithm is used,
such as a table-based implementation of AES, it may be possible for an
attacker to mount a side channel attack against the online system.
Side channel attacks may also be mounted against applications
consuming decrypted data.
After an encryption key has been provided, fscrypt is not designed to
hide the plaintext file contents or filenames from other users on the
same system, regardless of the visibility of the keyring key.
Instead, existing access control mechanisms such as file mode bits,
POSIX ACLs, LSMs, or mount namespaces should be used for this purpose.
Also note that as long as the encryption keys are *anywhere* in
memory, an online attacker can necessarily compromise them by mounting
a physical attack or by exploiting any kernel security vulnerability
which provides an arbitrary memory read primitive.
While it is ostensibly possible to "evict" keys from the system,
recently accessed encrypted files will remain accessible at least
until the filesystem is unmounted or the VFS caches are dropped, e.g.
using ``echo 2 > /proc/sys/vm/drop_caches``. Even after that, if the
RAM is compromised before being powered off, it will likely still be
possible to recover portions of the plaintext file contents, if not
some of the encryption keys as well. (Since Linux v4.12, all
in-kernel keys related to fscrypt are sanitized before being freed.
However, userspace would need to do its part as well.)
Currently, fscrypt does not prevent a user from maliciously providing
an incorrect key for another user's existing encrypted files. A
protection against this is planned.
Key hierarchy
=============
Master Keys
-----------
Each encrypted directory tree is protected by a *master key*. Master
keys can be up to 64 bytes long, and must be at least as long as the
greater of the key length needed by the contents and filenames
encryption modes being used. For example, if AES-256-XTS is used for
contents encryption, the master key must be 64 bytes (512 bits). Note
that the XTS mode is defined to require a key twice as long as that
required by the underlying block cipher.
To "unlock" an encrypted directory tree, userspace must provide the
appropriate master key. There can be any number of master keys, each
of which protects any number of directory trees on any number of
filesystems.
Userspace should generate master keys either using a cryptographically
secure random number generator, or by using a KDF (Key Derivation
Function). Note that whenever a KDF is used to "stretch" a
lower-entropy secret such as a passphrase, it is critical that a KDF
designed for this purpose be used, such as scrypt, PBKDF2, or Argon2.
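
For example, a 64-byte master key suitable for AES-256-XTS contents
encryption could be generated with the getrandom() system call. This is a
minimal, illustrative sketch and not part of the original document::

    #include <sys/types.h>
    #include <sys/random.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned char master_key[64];  /* e.g. for AES-256-XTS contents encryption */

        if (getrandom(master_key, sizeof(master_key), 0) !=
            (ssize_t)sizeof(master_key)) {
            perror("getrandom");
            return 1;
        }
        /* The raw key would then be added to the kernel keyring and
         * referenced from an fscrypt policy; see the User API section. */
        return 0;
    }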
Per-file keys
-------------
Since each master key can protect many files, it is necessary to
"tweak" the encryption of each file so that the same plaintext in two
files doesn't map to the same ciphertext, or vice versa. In most
cases, fscrypt does this by deriving per-file keys. When a new
encrypted inode (regular file, directory, or symlink) is created,
fscrypt randomly generates a 16-byte nonce and stores it in the
inode's encryption xattr. Then, it uses a KDF (Key Derivation
Function) to derive the file's key from the master key and nonce.
The Adiantum encryption mode (see `Encryption modes and usage`_) is
special, since it accepts longer IVs and is suitable for both contents
and filenames encryption. For it, a "direct key" option is offered
where the file's nonce is included in the IVs and the master key is
used for encryption directly. This improves performance; however,
users must not use the same master key for any other encryption mode.
Below, the KDF and design considerations are described in more detail.
The current KDF works by encrypting the master key with AES-128-ECB,
using the file's nonce as the AES key. The output is used as the
derived key. If the output is longer than needed, then it is
truncated to the needed length.
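
As an illustration only, the derivation described above can be reproduced in
userspace roughly as follows. This is a minimal sketch, not the kernel's
implementation; the use of OpenSSL's EVP API and the helper name are
assumptions made for the example::

    /* Hypothetical sketch of the KDF described above:
     * derived = AES-128-ECB(key = nonce, data = master_key),
     * truncated to the needed length.  Build with -lcrypto.
     * Assumes master_len is a multiple of the AES block size.
     */
    #include <openssl/evp.h>
    #include <string.h>

    static int derive_key(const unsigned char nonce[16],
                          const unsigned char *master_key, int master_len,
                          unsigned char *derived, int derived_len)
    {
        unsigned char buf[64];
        int outlen = 0, tmplen = 0;
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();

        if (!ctx || master_len > (int)sizeof(buf) || derived_len > master_len)
            return -1;
        EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, nonce, NULL);
        EVP_CIPHER_CTX_set_padding(ctx, 0);
        EVP_EncryptUpdate(ctx, buf, &outlen, master_key, master_len);
        EVP_EncryptFinal_ex(ctx, buf + outlen, &tmplen);
        EVP_CIPHER_CTX_free(ctx);

        memcpy(derived, buf, derived_len);  /* truncate if longer than needed */
        return 0;
    }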
Note: this KDF meets the primary security requirement, which is to
produce unique derived keys that preserve the entropy of the master
key, assuming that the master key is already a good pseudorandom key.
However, it is nonstandard and has some problems such as being
reversible, so it is generally considered to be a mistake! It may be
replaced with HKDF or another more standard KDF in the future.
Key derivation was chosen over key wrapping because wrapped keys would
require larger xattrs which would be less likely to fit in-line in the
filesystem's inode table, and there didn't appear to be any
significant advantages to key wrapping. In particular, currently
there is no requirement to support unlocking a file with multiple
alternative master keys or to support rotating master keys. Instead,
the master keys may be wrapped in userspace, e.g. as is done by the
`fscrypt <https://github.com/google/fscrypt>`_ tool.
Including the inode number in the IVs was considered. However, it was
rejected as it would have prevented ext4 filesystems from being
resized, and by itself still wouldn't have been sufficient to prevent
the same key from being directly reused for both XTS and CTS-CBC.
Encryption modes and usage
==========================
fscrypt allows one encryption mode to be specified for file contents
and one encryption mode to be specified for filenames. Different
directory trees are permitted to use different encryption modes.
Currently, the following pairs of encryption modes are supported:
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
- Adiantum for both contents and filenames
If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
AES-128-CBC was added only for low-powered embedded devices with
crypto accelerators such as CAAM or CESA that do not support XTS.
Adiantum is a (primarily) stream cipher-based mode that is fast even
on CPUs without dedicated crypto instructions. It's also a true
wide-block mode, unlike XTS. It can also eliminate the need to derive
per-file keys. However, it depends on the security of two primitives,
XChaCha12 and AES-256, rather than just one. See the paper
"Adiantum: length-preserving encryption for entry-level processors"
(https://eprint.iacr.org/2018/720.pdf) for more details. To use
Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
implementations of ChaCha and NHPoly1305 should be enabled, e.g.
CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing
with ciphertext expansion.
Contents encryption
-------------------
For file contents, each filesystem block is encrypted independently.
Currently, only the case where the filesystem block size is equal to
the system's page size (usually 4096 bytes) is supported.
Each block's IV is set to the logical block number within the file as
a little endian number, except that:
- With CBC mode encryption, ESSIV is also used. Specifically, each IV
is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
of the file's data encryption key.
- In the "direct key" configuration (FS_POLICY_FLAG_DIRECT_KEY set in
the fscrypt_policy), the file's nonce is also appended to the IV.
Currently this is only allowed with the Adiantum encryption mode.
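To make the basic construction above concrete, here is a small sketch
of filling in the IV from the logical block number; the 16-byte IV size
and the helper name are assumptions made for the example, and the ESSIV
and direct-key adjustments are omitted::

    #include <stdint.h>
    #include <string.h>

    /* Fill a 16-byte IV with the logical block number in little endian
     * byte order; the remaining bytes stay zero. */
    static void fscrypt_basic_iv_sketch(uint64_t lblk_num, uint8_t iv[16])
    {
            int i;

            memset(iv, 0, 16);
            for (i = 0; i < 8; i++)
                    iv[i] = (uint8_t)(lblk_num >> (8 * i));
    }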
Filenames encryption
--------------------
For filenames, each full filename is encrypted at once. Because of
the requirements to retain support for efficient directory lookups and
filenames of up to 255 bytes, the same IV is used for every filename
in a directory.
However, each encrypted directory still uses a unique key; or
alternatively (for the "direct key" configuration) has the file's
nonce included in the IVs. Thus, IV reuse is limited to within a
single directory.
With CTS-CBC, the IV reuse means that when the plaintext filenames
share a common prefix at least as long as the cipher block size (16
bytes for AES), the corresponding encrypted filenames will also share
a common prefix. This is undesirable. Adiantum does not have this
weakness, as it is a wide-block encryption mode.
All supported filenames encryption modes accept any plaintext length
>= 16 bytes; cipher block alignment is not required. However,
filenames shorter than 16 bytes are NUL-padded to 16 bytes before
being encrypted. In addition, to reduce leakage of filename lengths
via their ciphertexts, all filenames are NUL-padded to the next 4, 8,
16, or 32-byte boundary (configurable). 32 is recommended since this
provides the best confidentiality, at the cost of making directory
entries consume slightly more space. Note that since NUL (``\0``) is
not otherwise a valid character in filenames, the padding will never
produce duplicate plaintexts.
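For example, the padded length of a filename can be computed as in the
following sketch; the helper name is made up, and the padding value is
the 4, 8, 16 or 32 chosen via the policy flags::

    #include <stddef.h>

    /* Round a plaintext filename length up to the configured padding
     * boundary, with a minimum of 16 bytes. */
    static size_t fscrypt_padded_len_sketch(size_t name_len, size_t padding)
    {
            if (name_len < 16)
                    name_len = 16;
            return (name_len + padding - 1) & ~(padding - 1);
    }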
Symbolic link targets are considered a type of filename and are
encrypted in the same way as filenames in directory entries, except
that IV reuse is not a problem as each symlink has its own inode.
User API
========
Setting an encryption policy
----------------------------
The FS_IOC_SET_ENCRYPTION_POLICY ioctl sets an encryption policy on an
empty directory or verifies that a directory or regular file already
has the specified encryption policy. It takes in a pointer to a
:c:type:`struct fscrypt_policy`, defined as follows::
    #define FS_KEY_DESCRIPTOR_SIZE 8

    struct fscrypt_policy {
            __u8 version;
            __u8 contents_encryption_mode;
            __u8 filenames_encryption_mode;
            __u8 flags;
            __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
    };
This structure must be initialized as follows:
- ``version`` must be 0.
- ``contents_encryption_mode`` and ``filenames_encryption_mode`` must
be set to constants from ``<linux/fs.h>`` which identify the
encryption modes to use. If unsure, use
FS_ENCRYPTION_MODE_AES_256_XTS (1) for ``contents_encryption_mode``
and FS_ENCRYPTION_MODE_AES_256_CTS (4) for
``filenames_encryption_mode``.
- ``flags`` must contain a value from ``<linux/fs.h>`` which
identifies the amount of NUL-padding to use when encrypting
filenames. If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3).
In addition, if the chosen encryption modes are both
FS_ENCRYPTION_MODE_ADIANTUM, this can contain
FS_POLICY_FLAG_DIRECT_KEY to specify that the master key should be
used directly, without key derivation.
- ``master_key_descriptor`` specifies how to find the master key in
the keyring; see `Adding keys`_. It is up to userspace to choose a
unique ``master_key_descriptor`` for each master key. The e4crypt
and fscrypt tools use the first 8 bytes of
``SHA-512(SHA-512(master_key))``, but this particular scheme is not
required. Also, the master key need not be in the keyring yet when
FS_IOC_SET_ENCRYPTION_POLICY is executed. However, it must be added
before any files can be created in the encrypted directory.
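Putting this together, a minimal userspace sketch of setting a policy
on an empty directory might look like the following. The directory path
and the key descriptor bytes are made-up examples, and the program
assumes kernel UAPI headers that expose :c:type:`struct fscrypt_policy`
via ``<linux/fs.h>``::

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(void)
    {
            struct fscrypt_policy policy = {
                    .version = 0,
                    .contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS,
                    .filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS,
                    .flags = FS_POLICY_FLAGS_PAD_32,
            };
            int fd;

            /* example descriptor; real programs derive it from the master key */
            memcpy(policy.master_key_descriptor,
                   "\x01\x23\x45\x67\x89\xab\xcd\xef", FS_KEY_DESCRIPTOR_SIZE);

            fd = open("/mnt/ext4/dir", O_RDONLY | O_DIRECTORY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy) != 0) {
                    perror("FS_IOC_SET_ENCRYPTION_POLICY");
                    return 1;
            }
            return 0;
    }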
If the file is not yet encrypted, then FS_IOC_SET_ENCRYPTION_POLICY
verifies that the file is an empty directory. If so, the specified
encryption policy is assigned to the directory, turning it into an
encrypted directory. After that, and after providing the
corresponding master key as described in `Adding keys`_, all regular
files, directories (recursively), and symlinks created in the
directory will be encrypted, inheriting the same encryption policy.
The filenames in the directory's entries will be encrypted as well.
Alternatively, if the file is already encrypted, then
FS_IOC_SET_ENCRYPTION_POLICY validates that the specified encryption
policy exactly matches the actual one. If they match, then the ioctl
returns 0. Otherwise, it fails with EEXIST. This works on both
regular files and directories, including nonempty directories.
Note that the ext4 filesystem does not allow the root directory to be
encrypted, even if it is empty. Users who want to encrypt an entire
filesystem with one key should consider using dm-crypt instead.
FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors:
- ``EACCES``: the file is not owned by the process's uid, nor does the
process have the CAP_FOWNER capability in a namespace with the file
owner's uid mapped
- ``EEXIST``: the file is already encrypted with an encryption policy
different from the one specified
- ``EINVAL``: an invalid encryption policy was specified (invalid
version, mode(s), or flags)
- ``ENOTDIR``: the file is unencrypted and is a regular file, not a
directory
- ``ENOTEMPTY``: the file is unencrypted and is a nonempty directory
- ``ENOTTY``: this type of filesystem does not implement encryption
- ``EOPNOTSUPP``: the kernel was not configured with encryption
support for this filesystem, or the filesystem superblock has not
had encryption enabled on it. (For example, to use encryption on an
ext4 filesystem, CONFIG_EXT4_ENCRYPTION must be enabled in the
kernel config, and the superblock must have had the "encrypt"
feature flag enabled using ``tune2fs -O encrypt`` or ``mkfs.ext4 -O
encrypt``.)
- ``EPERM``: this directory may not be encrypted, e.g. because it is
the root directory of an ext4 filesystem
- ``EROFS``: the filesystem is readonly
Getting an encryption policy
----------------------------
The FS_IOC_GET_ENCRYPTION_POLICY ioctl retrieves the :c:type:`struct
fscrypt_policy`, if any, for a directory or regular file. See above
for the struct definition. No additional permissions are required
beyond the ability to open the file.
FS_IOC_GET_ENCRYPTION_POLICY can fail with the following errors:
- ``EINVAL``: the file is encrypted, but it uses an unrecognized
encryption context format
- ``ENODATA``: the file is not encrypted
- ``ENOTTY``: this type of filesystem does not implement encryption
- ``EOPNOTSUPP``: the kernel was not configured with encryption
support for this filesystem
Note: if you only need to know whether a file is encrypted or not, on
most filesystems it is also possible to use the FS_IOC_GETFLAGS ioctl
and check for FS_ENCRYPT_FL, or to use the statx() system call and
check for STATX_ATTR_ENCRYPTED in stx_attributes.
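For instance, a quick check with statx() might look like the following
sketch, assuming a libc that exposes the statx() wrapper::

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
            struct statx stx;

            if (argc < 2)
                    return 1;
            if (statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx) != 0) {
                    perror("statx");
                    return 1;
            }
            printf("%s: %sencrypted\n", argv[1],
                   (stx.stx_attributes & STATX_ATTR_ENCRYPTED) ? "" : "not ");
            return 0;
    }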
Getting the per-filesystem salt
-------------------------------
Some filesystems, such as ext4 and F2FS, also support the deprecated
ioctl FS_IOC_GET_ENCRYPTION_PWSALT. This ioctl retrieves a randomly
generated 16-byte value stored in the filesystem superblock. This
value is intended to be used as a salt when deriving an encryption key
from a passphrase or other low-entropy user credential.
FS_IOC_GET_ENCRYPTION_PWSALT is deprecated. Instead, prefer to
generate and manage any needed salt(s) in userspace.
Adding keys
-----------
To provide a master key, userspace must add it to an appropriate
keyring using the add_key() system call (see:
``Documentation/security/keys/core.rst``). The key type must be
"logon"; keys of this type are kept in kernel memory and cannot be
read back by userspace. The key description must be "fscrypt:"
followed by the 16-character lower case hex representation of the
``master_key_descriptor`` that was set in the encryption policy. The
key payload must conform to the following structure::
    #define FS_MAX_KEY_SIZE 64

    struct fscrypt_key {
            u32 mode;
            u8 raw[FS_MAX_KEY_SIZE];
            u32 size;
    };
``mode`` is ignored; just set it to 0. The actual key is provided in
``raw`` with ``size`` indicating its size in bytes. That is, the
bytes ``raw[0..size-1]`` (inclusive) are the actual key.
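For illustration, providing a key from a C program might look like the
following sketch. The key bytes and the descriptor are placeholders,
the struct mirrors the payload layout documented above, and add_key()
comes from libkeyutils::

    #include <keyutils.h>
    #include <linux/types.h>
    #include <stdio.h>
    #include <string.h>

    #define FS_MAX_KEY_SIZE 64
    struct fscrypt_key {
            __u32 mode;
            __u8 raw[FS_MAX_KEY_SIZE];
            __u32 size;
    };

    int main(void)
    {
            struct fscrypt_key key = { .mode = 0, .size = 64 };
            key_serial_t kid;

            /* placeholder key material; a real key must be a high-entropy secret */
            memset(key.raw, 0xAA, key.size);

            /* description: "fscrypt:" + 16 lower case hex characters */
            kid = add_key("logon", "fscrypt:0123456789abcdef",
                          &key, sizeof(key), KEY_SPEC_SESSION_KEYRING);
            if (kid < 0) {
                    perror("add_key");
                    return 1;
            }
            return 0;
    }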
The key description prefix "fscrypt:" may alternatively be replaced
with a filesystem-specific prefix such as "ext4:". However, the
filesystem-specific prefixes are deprecated and should not be used in
new programs.
There are several different types of keyrings in which encryption keys
may be placed, such as a session keyring, a user session keyring, or a
user keyring. Each key must be placed in a keyring that is "attached"
to all processes that might need to access files encrypted with it, in
the sense that request_key() will find the key. Generally, if only
processes belonging to a specific user need to access a given
encrypted directory and no session keyring has been installed, then
that directory's key should be placed in that user's user session
keyring or user keyring. Otherwise, a session keyring should be
installed if needed, and the key should be linked into that session
keyring, or in a keyring linked into that session keyring.
Note: introducing the complex visibility semantics of keyrings here
was arguably a mistake --- especially given that by design, after any
process successfully opens an encrypted file (thereby setting up the
per-file key), possessing the keyring key is not actually required for
any process to read/write the file until its in-memory inode is
evicted. In the future there probably should be a way to provide keys
directly to the filesystem instead, which would make the intended
semantics clearer.
Access semantics
================
With the key
------------
With the encryption key, encrypted regular files, directories, and
symlinks behave very similarly to their unencrypted counterparts ---
after all, the encryption is intended to be transparent. However,
astute users may notice some differences in behavior:
- Unencrypted files, or files encrypted with a different encryption
policy (i.e. different key, modes, or flags), cannot be renamed or
linked into an encrypted directory; see `Encryption policy
enforcement`_. Attempts to do so will fail with EPERM. However,
encrypted files can be renamed within an encrypted directory, or
into an unencrypted directory.
- Direct I/O is not supported on encrypted files. Attempts to use
direct I/O on such files will fall back to buffered I/O.
- The fallocate operations FALLOC_FL_COLLAPSE_RANGE,
FALLOC_FL_INSERT_RANGE, and FALLOC_FL_ZERO_RANGE are not supported
on encrypted files and will fail with EOPNOTSUPP.
- Online defragmentation of encrypted files is not supported. The
EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with
EOPNOTSUPP.
- The ext4 filesystem does not support data journaling with encrypted
regular files. It will fall back to ordered data mode instead.
- DAX (Direct Access) is not supported on encrypted files.
- The st_size of an encrypted symlink will not necessarily give the
length of the symlink target as required by POSIX. It will actually
give the length of the ciphertext, which will be slightly longer
than the plaintext due to NUL-padding and an extra 2-byte overhead.
- The maximum length of an encrypted symlink is 2 bytes shorter than
the maximum length of an unencrypted symlink. For example, on an
EXT4 filesystem with a 4K block size, unencrypted symlinks can be up
to 4095 bytes long, while encrypted symlinks can only be up to 4093
bytes long (both lengths excluding the terminating null).
Note that mmap *is* supported. This is possible because the pagecache
for an encrypted file contains the plaintext, not the ciphertext.
Without the key
---------------
Some filesystem operations may be performed on encrypted regular
files, directories, and symlinks even before their encryption key has
been provided:
- File metadata may be read, e.g. using stat().
- Directories may be listed, in which case the filenames will be
listed in an encoded form derived from their ciphertext. The
current encoding algorithm is described in `Filename hashing and
encoding`_. The algorithm is subject to change, but it is
guaranteed that the presented filenames will be no longer than
NAME_MAX bytes, will not contain the ``/`` or ``\0`` characters, and
will uniquely identify directory entries.
The ``.`` and ``..`` directory entries are special. They are always
present and are not encrypted or encoded.
- Files may be deleted. That is, nondirectory files may be deleted
with unlink() as usual, and empty directories may be deleted with
rmdir() as usual. Therefore, ``rm`` and ``rm -r`` will work as
expected.
- Symlink targets may be read and followed, but they will be presented
in encrypted form, similar to filenames in directories. Hence, they
are unlikely to point to anywhere useful.
Without the key, regular files cannot be opened or truncated.
Attempts to do so will fail with ENOKEY. This implies that any
regular file operations that require a file descriptor, such as
read(), write(), mmap(), fallocate(), and ioctl(), are also forbidden.
Also without the key, files of any type (including directories) cannot
be created or linked into an encrypted directory, nor can a name in an
encrypted directory be the source or target of a rename, nor can an
O_TMPFILE temporary file be created in an encrypted directory. All
such operations will fail with ENOKEY.
It is not currently possible to backup and restore encrypted files
without the encryption key. This would require special APIs which
have not yet been implemented.
Encryption policy enforcement
=============================
After an encryption policy has been set on a directory, all regular
files, directories, and symbolic links created in that directory
(recursively) will inherit that encryption policy. Special files ---
that is, named pipes, device nodes, and UNIX domain sockets --- will
not be encrypted.
Except for those special files, it is forbidden to have unencrypted
files, or files encrypted with a different encryption policy, in an
encrypted directory tree. Attempts to link or rename such a file into
an encrypted directory will fail with EPERM. This is also enforced
during ->lookup() to provide limited protection against offline
attacks that try to disable or downgrade encryption in known locations
where applications may later write sensitive data. It is recommended
that systems implementing a form of "verified boot" take advantage of
this by validating all top-level encryption policies prior to access.
Implementation details
======================
Encryption context
------------------
An encryption policy is represented on-disk by a :c:type:`struct
fscrypt_context`. It is up to individual filesystems to decide where
to store it, but normally it would be stored in a hidden extended
attribute. It should *not* be exposed by the xattr-related system
calls such as getxattr() and setxattr() because of the special
semantics of the encryption xattr. (In particular, there would be
much confusion if an encryption policy were to be added to or removed
from anything other than an empty directory.) The struct is defined
as follows::
    #define FS_KEY_DESCRIPTOR_SIZE 8
    #define FS_KEY_DERIVATION_NONCE_SIZE 16

    struct fscrypt_context {
            u8 format;
            u8 contents_encryption_mode;
            u8 filenames_encryption_mode;
            u8 flags;
            u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
            u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
    };
Note that :c:type:`struct fscrypt_context` contains the same
information as :c:type:`struct fscrypt_policy` (see `Setting an
encryption policy`_), except that :c:type:`struct fscrypt_context`
also contains a nonce. The nonce is randomly generated by the kernel
and is used to derive the inode's encryption key as described in
`Per-file keys`_.
Data path changes
-----------------
For the read path (->readpage()) of regular files, filesystems can
read the ciphertext into the page cache and decrypt it in-place. The
page lock must be held until decryption has finished, to prevent the
page from becoming visible to userspace prematurely.
For the write path (->writepage()) of regular files, filesystems
cannot encrypt data in-place in the page cache, since the cached
plaintext must be preserved. Instead, filesystems must encrypt into a
temporary buffer or "bounce page", then write out the temporary
buffer. Some filesystems, such as UBIFS, already use temporary
buffers regardless of encryption. Other filesystems, such as ext4 and
F2FS, have to allocate bounce pages specially for encryption.
Filename hashing and encoding
-----------------------------
Modern filesystems accelerate directory lookups by using indexed
directories. An indexed directory is organized as a tree keyed by
filename hashes. When a ->lookup() is requested, the filesystem
normally hashes the filename being looked up so that it can quickly
find the corresponding directory entry, if any.
With encryption, lookups must be supported and efficient both with and
without the encryption key. Clearly, it would not work to hash the
plaintext filenames, since the plaintext filenames are unavailable
without the key. (Hashing the plaintext filenames would also make it
impossible for the filesystem's fsck tool to optimize encrypted
directories.) Instead, filesystems hash the ciphertext filenames,
i.e. the bytes actually stored on-disk in the directory entries. When
asked to do a ->lookup() with the key, the filesystem just encrypts
the user-supplied name to get the ciphertext.
Lookups without the key are more complicated. The raw ciphertext may
contain the ``\0`` and ``/`` characters, which are illegal in
filenames. Therefore, readdir() must base64-encode the ciphertext for
presentation. For most filenames, this works fine; on ->lookup(), the
filesystem just base64-decodes the user-supplied name to get back to
the raw ciphertext.
However, for very long filenames, base64 encoding would cause the
filename length to exceed NAME_MAX. To prevent this, readdir()
actually presents long filenames in an abbreviated form which encodes
a strong "hash" of the ciphertext filename, along with the optional
filesystem-specific hash(es) needed for directory lookups. This
allows the filesystem to still, with a high degree of confidence, map
the filename given in ->lookup() back to a particular directory entry
that was previously listed by readdir(). See :c:type:`struct
fscrypt_digested_name` in the source for more details.
Note that the precise way that filenames are presented to userspace
without the key is subject to change in the future. It is only meant
as a way to temporarily present valid filenames so that commands like
``rm -r`` work as expected on encrypted directories.


@@ -82,6 +82,29 @@ Only the lists of names from directories are merged. Other content
such as metadata and extended attributes are reported for the upper
directory only. These attributes of the lower directory are hidden.
credentials
-----------
By default, all access to the upper, lower and work directories is
performed with the mounter's MAC and DAC credentials, which are
recorded at mount time. Incoming accesses are checked against the
caller's credentials.
In the case where the caller's MAC or DAC credentials do not overlap
with the mounter's, a use case available in older versions of the
driver, the override_creds mount flag can be turned off. This helps
when the caller has legitimate credentials that the mounter does not.
Several unintended side effects will occur, though. A caller without
certain key capabilities, or with lower privilege, will not always be
able to delete files or directories, create nodes, or search some
restricted directories. The ability to search and read a directory
entry is inconsistent, because the cache mechanism does not retest the
credentials; the assumption is that a privileged caller can fill the
cache, after which a lower-privileged caller can read the directory
cache. This uneven security model, where the cache, upperdir and
workdir are opened at the mounter's privilege but accessed without
creating a form of privilege escalation, should only be used with a
strict understanding of the side effects and of the security policies.
whiteouts and opaque directories
--------------------------------


@@ -487,7 +487,9 @@ manner. The codes are the following:
Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed; the flags may
vanish or, conversely, new ones may be added. Interpretation of their meaning
might change in the future as well, so each consumer of these flags has to
follow each specific kernel version for the exact semantics.
The "Name" field will only be present on a mapping that has been named by
userspace, and will show the name passed in by userspace.


@@ -160,7 +160,7 @@ them but you should handle them according to your needs.
UHID_OUTPUT:
This is sent if the HID device driver wants to send raw data to the I/O
device on the interrupt channel. You should read the payload and forward it to
the device. The payload is of type "struct uhid_output_req".
This may be received even though you haven't received UHID_OPEN, yet.
UHID_GET_REPORT:


@@ -0,0 +1,15 @@
========================
Hardware vulnerabilities
========================
This section describes CPU vulnerabilities and provides an overview of the
possible mitigations along with guidance for selecting mitigations if they
are configurable at compile, boot or run time.
.. toctree::
   :maxdepth: 1

   l1tf
   mds
   tsx_async_abort
   multihit


@@ -0,0 +1,615 @@
L1TF - L1 Terminal Fault
========================
L1 Terminal Fault is a hardware vulnerability which allows unprivileged
speculative access to data which is available in the Level 1 Data Cache
when the page table entry controlling the virtual address, which is used
for the access, has the Present bit cleared or other reserved bits set.
Affected processors
-------------------
This vulnerability affects a wide range of Intel processors. The
vulnerability is not present on:
- Processors from AMD, Centaur and other non Intel vendors
- Older processor models, where the CPU family is < 6
- A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
Penwell, Pineview, Silvermont, Airmont, Merrifield)
- The Intel XEON PHI family
- Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
by the Meltdown vulnerability either. These CPUs should become
available by end of 2018.
Whether a processor is affected or not can be read out from the L1TF
vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
Related CVEs
------------
The following CVE entries are related to the L1TF vulnerability:
============= ================= ==============================
CVE-2018-3615 L1 Terminal Fault SGX related aspects
CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
============= ================= ==============================
Problem
-------
If an instruction accesses a virtual address for which the relevant page
table entry (PTE) has the Present bit cleared or other reserved bits set,
then speculative execution ignores the invalid PTE and loads the referenced
data if it is present in the Level 1 Data Cache, as if the page referenced
by the address bits in the PTE was still present and accessible.
While this is a purely speculative mechanism and the instruction will raise
a page fault when it is retired eventually, the pure act of loading the
data and making it available to other speculative instructions opens up the
opportunity for side channel attacks to unprivileged malicious code,
similar to the Meltdown attack.
While Meltdown breaks the user space to kernel space protection, L1TF
allows attacking any physical memory address in the system and the attack
works across all protection domains. It allows attacks against SGX and also
works from inside virtual machines because the speculation bypasses the
extended page table (EPT) protection mechanism.
Attack scenarios
----------------
1. Malicious user space
^^^^^^^^^^^^^^^^^^^^^^^
Operating Systems store arbitrary information in the address bits of a
PTE which is marked non present. This allows a malicious user space
application to attack the physical memory to which these PTEs resolve.
In some cases user-space can maliciously influence the information
encoded in the address bits of the PTE, thus making attacks more
deterministic and more practical.
The Linux kernel contains a mitigation for this attack vector, PTE
inversion, which is permanently enabled and has no performance
impact. The kernel ensures that the address bits of PTEs, which are not
marked present, never point to cacheable physical memory space.
A system with an up to date kernel is protected against attacks from
malicious user space applications.
2. Malicious guest in a virtual machine
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The fact that L1TF breaks all domain protections allows malicious guest
OSes, which can control the PTEs directly, and malicious guest user
space applications, which run on an unprotected guest kernel lacking the
PTE inversion mitigation for L1TF, to attack physical host memory.
A special aspect of L1TF in the context of virtualization is symmetric
multi threading (SMT). The Intel implementation of SMT is called
HyperThreading. The fact that Hyperthreads on the affected processors
share the L1 Data Cache (L1D) is important for this. As the flaw only
allows attacking data which is present in L1D, a malicious guest running
on one Hyperthread can attack the data which is brought into the L1D by
the context which runs on the sibling Hyperthread of the same physical
core. This context can be the host OS, host user space or a different guest.
If the processor does not support Extended Page Tables, the attack is
only possible when the hypervisor does not sanitize the content of the
effective (shadow) page tables.
While solutions exist to mitigate these attack vectors fully, these
mitigations are not enabled by default in the Linux kernel because they
can affect performance significantly. The kernel provides several
mechanisms which can be utilized to address the problem depending on the
deployment scenario. The mitigations, their protection scope and impact
are described in the next sections.
The default mitigations and the rationale for choosing them are explained
at the end of this document. See :ref:`default_mitigations`.
.. _l1tf_sys_info:
L1TF system information
-----------------------
The Linux kernel provides a sysfs interface to enumerate the current L1TF
status of the system: whether the system is vulnerable, and which
mitigations are active. The relevant sysfs file is:
/sys/devices/system/cpu/vulnerabilities/l1tf
The possible values in this file are:
=========================== ===============================
'Not affected' The processor is not vulnerable
'Mitigation: PTE Inversion' The host protection is active
=========================== ===============================
If KVM/VMX is enabled and the processor is vulnerable then the following
information is appended to the 'Mitigation: PTE Inversion' part:
- SMT status:
===================== ================
'VMX: SMT vulnerable' SMT is enabled
'VMX: SMT disabled' SMT is disabled
===================== ================
- L1D Flush mode:
================================ ====================================
'L1D vulnerable' L1D flushing is disabled
'L1D conditional cache flushes' L1D flush is conditionally enabled
'L1D cache flushes' L1D flush is unconditionally enabled
================================ ====================================
The resulting grade of protection is discussed in the following sections.
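The file contents can also be read programmatically; a trivial,
illustrative sketch::

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/l1tf", "r");

            if (!f || !fgets(line, sizeof(line), f)) {
                    perror("l1tf");
                    return 1;
            }
            printf("L1TF status: %s", line);
            fclose(f);
            return 0;
    }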
Host mitigation mechanism
-------------------------
The kernel is unconditionally protected against L1TF attacks from malicious
user space running on the host.
Guest mitigation mechanisms
---------------------------
.. _l1d_flush:
1. L1D flush on VMENTER
^^^^^^^^^^^^^^^^^^^^^^^
To make sure that a guest cannot attack data which is present in the L1D
the hypervisor flushes the L1D before entering the guest.
Flushing the L1D evicts not only the data which should not be accessed
by a potentially malicious guest, it also flushes the guest
data. Flushing the L1D has a performance impact as the processor has to
bring the flushed guest data back into the L1D. Depending on the
frequency of VMEXIT/VMENTER and the type of computations in the guest
performance degradation in the range of 1% to 50% has been observed. For
scenarios where guest VMEXIT/VMENTER are rare the performance impact is
minimal. Virtio and mechanisms like posted interrupts are designed to
confine the VMEXITs to a bare minimum, but specific configurations and
application scenarios might still suffer from a high VMEXIT rate.
The kernel provides two L1D flush modes:
- conditional ('cond')
- unconditional ('always')
The conditional mode avoids L1D flushing after VMEXITs which execute
only audited code paths before the corresponding VMENTER. These code
paths have been verified not to expose secrets or other interesting
data to an attacker, but they can leak information about the address
space layout of the hypervisor.
Unconditional mode flushes L1D on all VMENTER invocations and provides
maximum protection. It has a higher overhead than the conditional
mode. The overhead cannot be quantified correctly as it depends on the
workload scenario and the resulting number of VMEXITs.
The general recommendation is to enable L1D flush on VMENTER. The kernel
defaults to conditional mode on affected processors.
**Note** that L1D flush does not prevent the SMT problem, because the
sibling thread will also bring its data back into the L1D, which makes it
attackable again.
L1D flush can be controlled by the administrator via the kernel command
line and sysfs control files. See :ref:`mitigation_control_command_line`
and :ref:`mitigation_control_kvm`.
.. _guest_confinement:
2. Guest VCPU confinement to dedicated physical cores
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To address the SMT problem, it is possible to make a guest or a group of
guests affine to one or more physical cores. The proper mechanism for
that is to utilize exclusive cpusets to ensure that no other guest or
host tasks can run on these cores.
If only a single guest or related guests run on sibling SMT threads on
the same physical core then they can only attack their own memory and
restricted parts of the host memory.
Host memory is attackable when one of the sibling SMT threads runs in
host OS (hypervisor) context and the other in guest context. The amount
of valuable information from the host OS context depends on the context
which the host OS executes, i.e. interrupts, soft interrupts and kernel
threads. The amount of valuable data from these contexts cannot be
declared as non-interesting for an attacker without deep inspection of
the code.
**Note**, that assigning guests to a fixed set of physical cores affects
the ability of the scheduler to do load balancing and might have
negative effects on CPU utilization depending on the hosting
scenario. Disabling SMT might be a viable alternative for particular
scenarios.
For further information about confining guests to a single or to a group
of cores consult the cpusets documentation:
https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
.. _interrupt_isolation:
3. Interrupt affinity
^^^^^^^^^^^^^^^^^^^^^
Interrupts can be made affine to logical CPUs. This is not universally
true because there are types of interrupts which are truly per CPU
interrupts, e.g. the local timer interrupt. Aside from that, multi queue
devices affine their interrupts to single CPUs or groups of CPUs per
queue without allowing the administrator to control the affinities.
Moving the interrupts, which can be affinity controlled, away from CPUs
which run untrusted guests, reduces the attack vector space.
Whether the interrupts which are affine to CPUs running untrusted
guests provide interesting data for an attacker depends on the system
configuration and the scenarios which run on the system. While for some
of the interrupts it can be assumed that they won't expose interesting
information beyond exposing hints about the host OS memory layout, there
is no way to make general assumptions.
Interrupt affinity can be controlled by the administrator via the
/proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
available at:
https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
.. _smt_control:
4. SMT control
^^^^^^^^^^^^^^
To prevent the SMT issues of L1TF it might be necessary to disable SMT
completely. Disabling SMT can have a significant performance impact, but
the impact depends on the hosting scenario and the type of workloads.
The impact of disabling SMT also needs to be weighed against the impact
of other mitigation solutions like confining guests to dedicated cores.
The kernel provides a sysfs interface to retrieve the status of SMT and
to control it. It also provides a kernel command line interface to
control SMT.
The kernel command line interface consists of the following options:
=========== ==========================================================
nosmt Affects the bring up of the secondary CPUs during boot. The
kernel tries to bring all present CPUs online during the
boot process. "nosmt" makes sure that from each physical
core only one - the so called primary (hyper) thread is
activated. Due to a design flaw of Intel processors related
to Machine Check Exceptions the non primary siblings have
to be brought up at least partially and are then shut down
again. "nosmt" can be undone via the sysfs interface.
nosmt=force Has the same effect as "nosmt" but it does not allow to
undo the SMT disable via the sysfs interface.
=========== ==========================================================
The sysfs interface provides two files:
- /sys/devices/system/cpu/smt/control
- /sys/devices/system/cpu/smt/active
/sys/devices/system/cpu/smt/control:
This file allows the SMT control state to be read out and provides the
ability to disable or (re)enable SMT. The possible states are:
============== ===================================================
on SMT is supported by the CPU and enabled. All
logical CPUs can be onlined and offlined without
restrictions.
off SMT is supported by the CPU and disabled. Only
the so called primary SMT threads can be onlined
and offlined without restrictions. An attempt to
online a non-primary sibling is rejected
forceoff Same as 'off' but the state cannot be controlled.
Attempts to write to the control file are rejected.
notsupported The processor does not support SMT. It's therefore
not affected by the SMT implications of L1TF.
Attempts to write to the control file are rejected.
============== ===================================================
The possible states which can be written into this file to control SMT
state are:
- on
- off
- forceoff
/sys/devices/system/cpu/smt/active:
This file reports whether SMT is enabled and active, i.e. if on any
physical core two or more sibling threads are online.
SMT control is also possible at boot time via the l1tf kernel command
line parameter in combination with L1D flush control. See
:ref:`mitigation_control_command_line`.
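As an illustration, SMT can be disabled at run time by writing "off" to
the control file, e.g. with a sketch like the following (root required)::

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/devices/system/cpu/smt/control", "w");

            if (!f || fputs("off", f) == EOF || fclose(f) == EOF) {
                    perror("smt/control");
                    return 1;
            }
            return 0;
    }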
5. Disabling EPT
^^^^^^^^^^^^^^^^
Disabling EPT for virtual machines provides full mitigation for L1TF even
with SMT enabled, because the effective page tables for guests are
managed and sanitized by the hypervisor. However, disabling EPT has a
significant performance impact, especially when the Meltdown mitigation
KPTI is enabled.
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
There is ongoing research and development for new mitigation mechanisms to
address the performance impact of disabling SMT or EPT.
.. _mitigation_control_command_line:
Mitigation control on the kernel command line
---------------------------------------------
The kernel command line allows the L1TF mitigations to be controlled at boot
time with the option "l1tf=". The valid arguments for this option are:
============ =============================================================
full Provides all available mitigations for the L1TF
vulnerability. Disables SMT and enables all mitigations in
the hypervisors, i.e. unconditional L1D flushing
SMT control and L1D flush control via the sysfs interface
is still possible after boot. Hypervisors will issue a
warning when the first VM is started in a potentially
insecure configuration, i.e. SMT enabled or L1D flush
disabled.
full,force Same as 'full', but disables SMT and L1D flush runtime
control. Implies the 'nosmt=force' command line option.
(i.e. sysfs control of SMT is disabled.)
flush Leaves SMT enabled and enables the default hypervisor
mitigation, i.e. conditional L1D flushing
SMT control and L1D flush control via the sysfs interface
is still possible after boot. Hypervisors will issue a
warning when the first VM is started in a potentially
insecure configuration, i.e. SMT enabled or L1D flush
disabled.
flush,nosmt Disables SMT and enables the default hypervisor mitigation,
i.e. conditional L1D flushing.
SMT control and L1D flush control via the sysfs interface
is still possible after boot. Hypervisors will issue a
warning when the first VM is started in a potentially
insecure configuration, i.e. SMT enabled or L1D flush
disabled.
flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
started in a potentially insecure configuration.
off Disables hypervisor mitigations and doesn't emit any
warnings.
It also drops the swap size and available RAM limit restrictions
on both hypervisor and bare metal.
============ =============================================================
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
.. _mitigation_control_kvm:
Mitigation control for KVM - module parameter
-------------------------------------------------------------
The KVM hypervisor mitigation mechanism, flushing the L1D cache when
entering a guest, can be controlled with a module parameter.
The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
following arguments:
============ ==============================================================
always L1D cache flush on every VMENTER.
cond Flush L1D on VMENTER only when the code between VMEXIT and
VMENTER can leak host memory which is considered
interesting for an attacker. This still can leak host memory
which allows e.g. determining the host's address space layout.
never Disables the mitigation
============ ==============================================================
The parameter can be provided on the kernel command line, as a module
parameter when loading the modules and at runtime modified via the sysfs
file:
/sys/module/kvm_intel/parameters/vmentry_l1d_flush
The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
module parameter is ignored and writes to the sysfs file are rejected.
.. _mitigation_selection:
Mitigation selection guide
--------------------------
1. No virtualization in use
^^^^^^^^^^^^^^^^^^^^^^^^^^^
The system is protected by the kernel unconditionally and no further
action is required.
2. Virtualization with trusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the guest comes from a trusted source and the guest OS kernel is
guaranteed to have the L1TF mitigations in place the system is fully
protected against L1TF and no further action is required.
To avoid the overhead of the default L1D flushing on VMENTER the
administrator can disable the flushing via the kernel command line and
sysfs control files. See :ref:`mitigation_control_command_line` and
:ref:`mitigation_control_kvm`.
3. Virtualization with untrusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3.1. SMT not supported or disabled
""""""""""""""""""""""""""""""""""
If SMT is not supported by the processor or disabled in the BIOS or by
the kernel, it's only required to enforce L1D flushing on VMENTER.
Conditional L1D flushing is the default behaviour and can be tuned. See
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
3.2. EPT not supported or disabled
""""""""""""""""""""""""""""""""""
If EPT is not supported by the processor or disabled in the hypervisor,
the system is fully protected. SMT can stay enabled and L1D flushing on
VMENTER is not required.
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
3.3. SMT and EPT supported and active
"""""""""""""""""""""""""""""""""""""
If SMT and EPT are supported and active then various degrees of
mitigations can be employed:
- L1D flushing on VMENTER:
L1D flushing on VMENTER is the minimal protection requirement, but it
is only potent in combination with other mitigation methods.
Conditional L1D flushing is the default behaviour and can be tuned. See
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
- Guest confinement:
Confinement of guests to a single or a group of physical cores which
are not running any other processes, can reduce the attack surface
significantly, but interrupts, soft interrupts and kernel threads can
still expose valuable data to a potential attacker. See
:ref:`guest_confinement`.
- Interrupt isolation:
Isolating the guest CPUs from interrupts can reduce the attack surface
further, but still allows a malicious guest to explore a limited amount
of host physical memory. This can at least be used to gain knowledge
about the host address space layout. The interrupts which have a fixed
affinity to the CPUs which run the untrusted guests can depending on
the scenario still trigger soft interrupts and schedule kernel threads
which might expose valuable information. See
:ref:`interrupt_isolation`.
The above three mitigation methods combined can provide protection to a
certain degree, but the risk of the remaining attack surface has to be
carefully analyzed. For full protection the following methods are
available:
- Disabling SMT:
Disabling SMT and enforcing the L1D flushing provides the maximum
amount of protection. This mitigation is not depending on any of the
above mitigation methods.
SMT control and L1D flushing can be tuned by the command line
parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
time with the matching sysfs control files. See :ref:`smt_control`,
:ref:`mitigation_control_command_line` and
:ref:`mitigation_control_kvm`.
- Disabling EPT:
Disabling EPT provides the maximum amount of protection as well. It is
not depending on any of the above mitigation methods. SMT can stay
enabled and L1D flushing is not required, but the performance impact is
significant.
EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
parameter.
3.4. Nested virtual machines
""""""""""""""""""""""""""""
When nested virtualization is in use, three operating systems are involved:
the bare metal hypervisor, the nested hypervisor and the nested virtual
machine. VMENTER operations from the nested hypervisor into the nested
guest will always be processed by the bare metal hypervisor. If KVM is the
bare metal hypervisor it will:
- Flush the L1D cache on every switch from the nested hypervisor to the
nested virtual machine, so that the nested hypervisor's secrets are not
exposed to the nested virtual machine;
- Flush the L1D cache on every switch from the nested virtual machine to
the nested hypervisor; this is a complex operation, and flushing the L1D
cache avoids that the bare metal hypervisor's secrets are exposed to the
nested virtual machine;
- Instruct the nested hypervisor to not perform any L1D cache flush. This
is an optimization to avoid double L1D flushing.
.. _default_mitigations:
Default mitigations
-------------------
The kernel default mitigations for vulnerable processors are:
- PTE inversion to protect against malicious user space. This is done
unconditionally and cannot be controlled. The swap storage is limited
to ~16TB.
- L1D conditional flushing on VMENTER when EPT is enabled for
a guest.
The kernel does not by default enforce the disabling of SMT, which leaves
SMT systems vulnerable when running untrusted guests with EPT enabled.
The rationale for this choice is:
- Force disabling SMT can break existing setups, especially with
unattended updates.
- If regular users run untrusted guests on their machine, then L1TF is
just an add on to other malware which might be embedded in an untrusted
guest, e.g. spam-bots or attacks on the local network.
There is no technical way to prevent a user from running untrusted code
on their machines blindly.
- It's technically extremely unlikely and, from today's knowledge, even
impossible that L1TF can be exploited via the most popular attack
mechanisms like JavaScript, because these mechanisms have no way to
control PTEs. If this were possible and no other mitigation were
available, then the default might be different.
- The administrators of cloud and hosting setups have to carefully
analyze the risk for their scenarios and make the appropriate
mitigation choices, which might even vary across their deployed
machines and also result in other changes of their overall setup.
There is no way for the kernel to provide a sensible default for this
kind of scenario.


@@ -0,0 +1,311 @@
MDS - Microarchitectural Data Sampling
======================================
Microarchitectural Data Sampling is a hardware vulnerability which allows
unprivileged speculative access to data which is available in various CPU
internal buffers.
Affected processors
-------------------
This vulnerability affects a wide range of Intel processors. The
vulnerability is not present on:
- Processors from AMD, Centaur and other non Intel vendors
- Older processor models, where the CPU family is < 6
- Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
- Intel processors which have the ARCH_CAP_MDS_NO bit set in the
IA32_ARCH_CAPABILITIES MSR.
Whether a processor is affected or not can be read out from the MDS
vulnerability file in sysfs. See :ref:`mds_sys_info`.
Not all processors are affected by all variants of MDS, but the mitigation
is identical for all of them so the kernel treats them as a single
vulnerability.
Related CVEs
------------
The following CVE entries are related to the MDS vulnerability:
============== ===== ===================================================
CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
============== ===== ===================================================
Problem
-------
When performing store, load, L1 refill operations, processors write data
into temporary microarchitectural structures (buffers). The data in the
buffer can be forwarded to load operations as an optimization.
Under certain conditions, usually a fault/assist caused by a load
operation, data unrelated to the load memory address can be speculatively
forwarded from the buffers. Because the load operation causes a fault or
assist and its result will be discarded, the forwarded data will not cause
incorrect program execution or state changes. But a malicious operation
may be able to forward this speculative data to a disclosure gadget which
allows in turn to infer the value via a cache side channel attack.
Because the buffers are potentially shared between Hyper-Threads cross
Hyper-Thread attacks are possible.
Deeper technical information is available in the MDS specific x86
architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
Attack scenarios
----------------
Attacks against the MDS vulnerabilities can be mounted from malicious
non-privileged user space applications running on hosts or guests. Malicious
guest OSes can obviously mount attacks as well.
Contrary to other speculation-based vulnerabilities, the MDS vulnerability
does not allow the attacker to control the memory target address. As a
consequence the attacks are purely sampling based, but, as demonstrated with
the TLBleed attack, samples can be postprocessed successfully.
Web-Browsers
^^^^^^^^^^^^
It's unclear whether attacks through Web-Browsers are possible at
all. The exploitation through JavaScript is considered very unlikely,
but other widely used web technologies like WebAssembly could possibly be
abused.
.. _mds_sys_info:
MDS system information
-----------------------
The Linux kernel provides a sysfs interface to enumerate the current MDS
status of the system: whether the system is vulnerable, and which
mitigations are active. The relevant sysfs file is:
/sys/devices/system/cpu/vulnerabilities/mds
The possible values in this file are:
.. list-table::

   * - 'Not affected'
     - The processor is not vulnerable
   * - 'Vulnerable'
     - The processor is vulnerable, but no mitigation enabled
   * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
     - The processor is vulnerable but microcode is not updated.
       The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
   * - 'Mitigation: Clear CPU buffers'
     - The processor is vulnerable and the CPU buffer clearing mitigation is
       enabled.
If the processor is vulnerable then the following information is appended
to the above information:
======================== ============================================
'SMT vulnerable' SMT is enabled
'SMT mitigated' SMT is enabled and mitigated
'SMT disabled' SMT is disabled
'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
======================== ============================================
.. _vmwerv:
Best effort mitigation mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the processor is vulnerable, but the availability of the microcode based
mitigation mechanism is not advertised via CPUID the kernel selects a best
effort mitigation mode. This mode invokes the mitigation instructions
without a guarantee that they clear the CPU buffers.
This is done to address virtualization scenarios where the host has the
microcode update applied, but the hypervisor is not yet updated to expose
the CPUID to the guest. If the host has updated microcode, the protection
takes effect; otherwise a few CPU cycles are wasted pointlessly.
The state in the mds sysfs file reflects this situation accordingly.
Mitigation mechanism
-------------------------
The kernel detects the affected CPUs and the presence of the microcode
which is required.
If a CPU is affected and the microcode is available, then the kernel
enables the mitigation by default. The mitigation can be controlled at boot
time via a kernel command line option. See
:ref:`mds_mitigation_control_command_line`.
.. _cpu_buffer_clear:
CPU buffer clearing
^^^^^^^^^^^^^^^^^^^
The mitigation for MDS clears the affected CPU buffers on return to user
space and when entering a guest.
If SMT is enabled it also clears the buffers on idle entry when the CPU
is only affected by MSBDS and not any other MDS variant, because the
other variants cannot be protected against cross Hyper-Thread attacks.
For CPUs which are only affected by MSBDS the user space, guest and idle
transition mitigations are sufficient and SMT is not affected.
.. _virt_mechanism:
Virtualization mitigation
^^^^^^^^^^^^^^^^^^^^^^^^^
The protection for host to guest transition depends on the L1TF
vulnerability of the CPU:
- CPU is affected by L1TF:
If the L1D flush mitigation is enabled and up to date microcode is
available, the L1D flush mitigation is automatically protecting the
guest transition.
If the L1D flush mitigation is disabled then the MDS mitigation is
invoked explicitly when the host MDS mitigation is enabled.
For details on L1TF and virtualization see:
:ref:`Documentation/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
- CPU is not affected by L1TF:
CPU buffers are flushed before entering the guest when the host MDS
mitigation is enabled.
The resulting MDS protection matrix for the host to guest transition:
============ ===== ============= ============ =================
L1TF MDS VMX-L1FLUSH Host MDS MDS-State
Don't care No Don't care N/A Not affected
Yes Yes Disabled Off Vulnerable
Yes Yes Disabled Full Mitigated
Yes Yes Enabled Don't care Mitigated
No Yes N/A Off Vulnerable
No Yes N/A Full Mitigated
============ ===== ============= ============ =================
This only covers the host to guest transition, i.e. prevents leakage from
host to guest, but does not protect the guest internally. Guests need to
have their own protections.
.. _xeon_phi:
XEON PHI specific considerations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The XEON PHI processor family is affected by MSBDS which can be exploited
cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
the use of MWAIT in user space (Ring 3), which opens a potential attack vector
for malicious user space. The exposure can be disabled on the kernel
command line with the 'ring3mwait=disable' command line option.
XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
before the CPU enters an idle state. As XEON PHI is not affected by L1TF
either, disabling SMT is not required for full protection.
.. _mds_smt_control:
SMT control
^^^^^^^^^^^
All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
means on CPUs which are affected by MFBDS or MLPDS it is necessary to
disable SMT for full protection. These are most of the affected CPUs; the
exception is XEON PHI, see :ref:`xeon_phi`.
Disabling SMT can have a significant performance impact, but the impact
depends on the type of workloads.
See the relevant chapter in the L1TF mitigation documentation for details:
:ref:`Documentation/hw-vuln/l1tf.rst <smt_control>`.
.. _mds_mitigation_control_command_line:
Mitigation control on the kernel command line
---------------------------------------------
The kernel command line allows the MDS mitigations to be controlled at boot
time with the option "mds=". The valid arguments for this option are:
============ =============================================================
full If the CPU is vulnerable, enable all available mitigations
for the MDS vulnerability, CPU buffer clearing on exit to
userspace and when entering a VM. Idle transitions are
protected as well if SMT is enabled.
It does not automatically disable SMT.
full,nosmt The same as mds=full, with SMT disabled on vulnerable
CPUs. This is the complete mitigation.
off Disables MDS mitigations completely.
============ =============================================================
Not specifying this option is equivalent to "mds=full". For processors
that are affected by both TAA (TSX Asynchronous Abort) and MDS,
specifying just "mds=off" without an accompanying "tsx_async_abort=off"
will have no effect as the same mitigation is used for both
vulnerabilities.
Mitigation selection guide
--------------------------
1. Trusted userspace
^^^^^^^^^^^^^^^^^^^^
If all userspace applications are from a trusted source and do not
execute untrusted code which is supplied externally, then the mitigation
can be disabled.
2. Virtualization with trusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The same considerations as above versus trusted user space apply.
3. Virtualization with untrusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The protection depends on the state of the L1TF mitigations.
See :ref:`virt_mechanism`.
If the MDS mitigation is enabled and SMT is disabled, guest to host and
guest to guest attacks are prevented.
.. _mds_default_mitigations:
Default mitigations
-------------------
The kernel default mitigations for vulnerable processors are:
- Enable CPU buffer clearing
The kernel does not by default enforce the disabling of SMT, which leaves
SMT systems vulnerable when running untrusted code. The same rationale as
for L1TF applies.
See :ref:`Documentation/hw-vuln/l1tf.rst <default_mitigations>`.


@@ -0,0 +1,163 @@
iTLB multihit
=============
iTLB multihit is an erratum where some processors may incur a machine check
error, possibly resulting in an unrecoverable CPU lockup, when an
instruction fetch hits multiple entries in the instruction TLB. This can
occur when the page size is changed along with either the physical address
or cache type. A malicious guest running on a virtualized system can
exploit this erratum to perform a denial of service attack.
Affected processors
-------------------
Variations of this erratum are present on most Intel Core and Xeon processor
models. The erratum is not present on:
- non-Intel processors
- Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont)
- Intel processors that have the PSCHANGE_MC_NO bit set in the
IA32_ARCH_CAPABILITIES MSR.
Related CVEs
------------
The following CVE entry is related to this issue:
============== =================================================
CVE-2018-12207 Machine Check Error Avoidance on Page Size Change
============== =================================================
Problem
-------
Privileged software, including the OS and virtual machine managers (VMM), is
in charge of memory management. A key component of memory management is the
control of the page tables. Modern processors use virtual memory, a technique
that creates the illusion of a very large memory space. This virtual space is
split into pages of a given size. Page tables translate virtual addresses to
physical addresses.
To reduce latency when performing a virtual to physical address translation,
processors include a structure, called TLB, that caches recent translations.
There are separate TLBs for instruction (iTLB) and data (dTLB).
Under this erratum, instructions are fetched from a linear address translated
using a 4 KB translation cached in the iTLB. Privileged software then modifies
the paging structure so that the same linear address is mapped using a large
page size (2 MB, 4 MB, 1 GB) with a different physical address or memory type.
After the paging structure modification, but before the software invalidates
any iTLB entries for the linear address, a code fetch from the same linear
address may cause a machine-check error which can result in a system hang or
shutdown.
Attack scenarios
----------------
Attacks against the iTLB multihit erratum can be mounted from malicious
guests in a virtualized system.
iTLB multihit system information
--------------------------------
The Linux kernel provides a sysfs interface to enumerate the current iTLB
multihit status of the system: whether the system is vulnerable and which
mitigations are active. The relevant sysfs file is:
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
The possible values in this file are:
.. list-table::
* - Not affected
- The processor is not vulnerable.
* - KVM: Mitigation: Split huge pages
- Software changes mitigate this issue.
* - KVM: Vulnerable
- The processor is vulnerable, but no mitigation is enabled
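For illustration, the file can be read like any other sysfs attribute. A
minimal user-space sketch (the program below is only an example, not an
existing tool)::

  #include <stdio.h>

  int main(void)
  {
      const char *path = "/sys/devices/system/cpu/vulnerabilities/itlb_multihit";
      char line[256];
      FILE *f = fopen(path, "r");

      if (!f) {
          perror(path);   /* missing on kernels without this sysfs entry */
          return 1;
      }
      if (fgets(line, sizeof(line), f))
          printf("iTLB multihit: %s", line);
      fclose(f);
      return 0;
  }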
Enumeration of the erratum
--------------------------------
A new bit has been allocated in the IA32_ARCH_CAPABILITIES MSR (PSCHANGE_MC_NO)
and will be set on CPUs which are mitigated against this issue.
======================================= =========== ===============================
IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable, check model
IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable, check model
IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable
======================================= =========== ===============================
Mitigation mechanism
-------------------------
This erratum can be mitigated by restricting the use of large page sizes to
non-executable pages. This forces all iTLB entries to be 4K, and removes
the possibility of multiple hits.
In order to mitigate the vulnerability, KVM initially marks all huge pages
as non-executable. If the guest attempts to execute in one of those pages,
the page is broken down into 4K pages, which are then marked executable.
If EPT is disabled or not available on the host, KVM is in control of TLB
flushes and the problematic situation cannot happen. However, the shadow
EPT paging mechanism used by nested virtualization is vulnerable, because
the nested guest can trigger multiple iTLB hits by modifying its own
(non-nested) page tables. For simplicity, KVM will make large pages
non-executable in all shadow paging modes.
Mitigation control on the kernel command line and KVM - module parameter
------------------------------------------------------------------------
The KVM hypervisor mitigation mechanism for marking huge pages as
non-executable can be controlled with a module parameter "nx_huge_pages=".
The kernel command line allows control of the iTLB multihit mitigations at
boot time with the option "kvm.nx_huge_pages=".
The valid arguments for these options are:
========== ================================================================
force Mitigation is enabled. In this case, the mitigation implements
non-executable huge pages in Linux kernel KVM module. All huge
pages in the EPT are marked as non-executable.
If a guest attempts to execute in one of those pages, the page is
broken down into 4K pages, which are then marked executable.
off Mitigation is disabled.
auto Enable mitigation only if the platform is affected and the kernel
was not booted with the "mitigations=off" command line parameter.
This is the default option.
========== ================================================================
Mitigation selection guide
--------------------------
1. No virtualization in use
^^^^^^^^^^^^^^^^^^^^^^^^^^^
The system is protected by the kernel unconditionally and no further
action is required.
2. Virtualization with trusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the guest comes from a trusted source, you may assume that the guest will
not attempt to maliciously exploit these errata and no further action is
required.
3. Virtualization with untrusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the guest comes from an untrusted source, the host kernel will need
to apply the iTLB multihit mitigation via the kernel command line or the kvm
module parameter.


@ -0,0 +1,279 @@
.. SPDX-License-Identifier: GPL-2.0
TAA - TSX Asynchronous Abort
======================================
TAA is a hardware vulnerability that allows unprivileged speculative access to
data which is available in various CPU internal buffers by using asynchronous
aborts within an Intel TSX transactional region.
Affected processors
-------------------
This vulnerability only affects Intel processors that support Intel
Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
also mitigate against TAA.
Whether a processor is affected or not can be read out from the TAA
vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
Related CVEs
------------
The following CVE entry is related to this TAA issue:
============== ===== ===================================================
CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some
microprocessors utilizing speculative execution may
allow an authenticated user to potentially enable
information disclosure via a side channel with
local access.
============== ===== ===================================================
Problem
-------
When performing store, load or L1 refill operations, processors write
data into temporary microarchitectural structures (buffers). The data in
those buffers can be forwarded to load operations as an optimization.
Intel TSX is an extension to the x86 instruction set architecture that adds
hardware transactional memory support to improve performance of multi-threaded
software. TSX lets the processor expose and exploit concurrency hidden in an
application by dynamically avoiding unnecessary synchronization.
TSX supports atomic memory transactions that are either committed (success) or
aborted. During an abort, operations that happened within the transactional region
are rolled back. An asynchronous abort takes place, among other options, when a
different thread accesses a cache line that is also used within the transactional
region when that access might lead to a data race.
Immediately after an uncompleted asynchronous abort, certain speculatively
executed loads may read data from those internal buffers and pass it to dependent
operations. This can be then used to infer the value via a cache side channel
attack.
Because the buffers are potentially shared between Hyper-Threads, cross
Hyper-Thread attacks are possible.
The victim of a malicious actor does not need to make use of TSX. Only the
attacker needs to begin a TSX transaction and raise an asynchronous abort
which in turn potentially leaks data stored in the buffers.
More detailed technical information is available in the TAA specific x86
architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
Attack scenarios
----------------
Attacks against the TAA vulnerability can be implemented from unprivileged
applications running on hosts or guests.
As for MDS, the attacker has no control over the memory addresses that can
be leaked. Only the victim is responsible for bringing data to the CPU. As
a result, the malicious actor has to sample as much data as possible and
then postprocess it to try to infer any useful information from it.
A potential attacker only has read access to the data. Also, there is no direct
privilege escalation by using this technique.
.. _tsx_async_abort_sys_info:
TAA system information
-----------------------
The Linux kernel provides a sysfs interface to enumerate the current TAA status
of mitigated systems. The relevant sysfs file is:
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
The possible values in this file are:
.. list-table::
* - 'Vulnerable'
- The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
- The system tries to clear the buffers but the microcode might not support the operation.
* - 'Mitigation: Clear CPU buffers'
- The microcode has been updated to clear the buffers. TSX is still enabled.
* - 'Mitigation: TSX disabled'
- TSX is disabled.
* - 'Not affected'
- The CPU is not affected by this issue.
.. _ucode_needed:
Best effort mitigation mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the processor is vulnerable, but the availability of the microcode-based
mitigation mechanism is not advertised via CPUID the kernel selects a best
effort mitigation mode. This mode invokes the mitigation instructions
without a guarantee that they clear the CPU buffers.
This is done to address virtualization scenarios where the host has the
microcode update applied, but the hypervisor is not yet updated to expose the
CPUID to the guest. If the host has updated microcode the protection takes
effect; otherwise a few CPU cycles are wasted pointlessly.
The state in the tsx_async_abort sysfs file reflects this situation
accordingly.
Mitigation mechanism
--------------------
The kernel detects the affected CPUs and the presence of the microcode which is
required. If a CPU is affected and the microcode is available, then the kernel
enables the mitigation by default.
The mitigation can be controlled at boot time via a kernel command line option.
See :ref:`taa_mitigation_control_command_line`.
.. _virt_mechanism:
Virtualization mitigation
^^^^^^^^^^^^^^^^^^^^^^^^^
Affected systems where the host has the TAA microcode and TAA is mitigated by
having previously disabled TSX are not vulnerable, regardless of the status
of the VMs.
In all other cases, if the host either does not have the TAA microcode or
the kernel is not mitigated, the system might be vulnerable.
.. _taa_mitigation_control_command_line:
Mitigation control on the kernel command line
---------------------------------------------
The kernel command line allows control of the TAA mitigations at boot time with
the option "tsx_async_abort=". The valid arguments for this option are:
============ =============================================================
off This option disables the TAA mitigation on affected platforms.
If the system has TSX enabled (see next parameter) and the CPU
is affected, the system is vulnerable.
full TAA mitigation is enabled. If TSX is enabled, on an affected
system it will clear CPU buffers on ring transitions. On
systems which are MDS-affected and deploy MDS mitigation,
TAA is also mitigated. Specifying this option on those
systems will have no effect.
full,nosmt The same as tsx_async_abort=full, with SMT disabled on
vulnerable CPUs that have TSX enabled. This is the complete
mitigation. When TSX is disabled, SMT is not disabled because
CPU is not vulnerable to cross-thread TAA attacks.
============ =============================================================
Not specifying this option is equivalent to "tsx_async_abort=full". For
processors that are affected by both TAA and MDS, specifying just
"tsx_async_abort=off" without an accompanying "mds=off" will have no
effect as the same mitigation is used for both vulnerabilities.
The kernel command line also allows control of the TSX feature using the
parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
to control the TSX feature and the enumeration of the TSX feature bits (RTM
and HLE) in CPUID.
The valid options are:
============ =============================================================
off Disables TSX on the system.
Note that this option takes effect only on newer CPUs which are
not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1
and which get the new IA32_TSX_CTRL MSR through a microcode
update. This new MSR allows for the reliable deactivation of
the TSX functionality.
on Enables TSX.
Although there are mitigations for all known security
vulnerabilities, TSX has been known to be an accelerator for
several previous speculation-related CVEs, and so there may be
unknown security risks associated with leaving it enabled.
auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
on the system.
============ =============================================================
Not specifying this option is equivalent to "tsx=off".
The following combinations of the "tsx_async_abort" and "tsx" options are
possible. For affected platforms, tsx=auto is equivalent to tsx=off and the
result will be:
========= ========================== =========================================
tsx=on tsx_async_abort=full The system will use VERW to clear CPU
buffers. Cross-thread attacks are still
possible on SMT machines.
tsx=on tsx_async_abort=full,nosmt As above, but cross-thread attacks on SMT
are mitigated.
tsx=on tsx_async_abort=off The system is vulnerable.
tsx=off tsx_async_abort=full TSX might be disabled if microcode
provides a TSX control MSR. If so,
system is not vulnerable.
tsx=off tsx_async_abort=full,nosmt Ditto
tsx=off tsx_async_abort=off ditto
========= ========================== =========================================
For unaffected platforms "tsx=on" and "tsx_async_abort=full" do not clear CPU
buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
the "tsx" command line argument has no effect.
For the affected platforms, the table below indicates the mitigation status for
the combinations of the CPUID bit MD_CLEAR and the IA32_ARCH_CAPABILITIES MSR
bits MDS_NO and TSX_CTRL_MSR.
======= ========= ============= ========================================
MDS_NO MD_CLEAR TSX_CTRL_MSR Status
======= ========= ============= ========================================
0 0 0 Vulnerable (needs microcode)
0 1 0 MDS and TAA mitigated via VERW
1 1 0 MDS fixed, TAA vulnerable if TSX enabled
because MD_CLEAR has no meaning and
VERW is not guaranteed to clear buffers
1 X 1 MDS fixed, TAA can be mitigated by
VERW or TSX_CTRL_MSR
======= ========= ============= ========================================
Mitigation selection guide
--------------------------
1. Trusted userspace and guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If all user space applications are from a trusted source and do not execute
untrusted code which is supplied externally, then the mitigation can be
disabled. The same applies to virtualized environments with trusted guests.
2. Untrusted userspace and guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If there are untrusted applications or guests on the system, enabling TSX
might allow a malicious actor to leak data from the host or from other
processes running on the same physical core.
If the microcode is available and TSX is disabled on the host, attacks
are prevented in a virtualized environment as well, even if the VMs do not
explicitly enable the mitigation.
.. _taa_default_mitigations:
Default mitigations
-------------------
The kernel's default action for vulnerable processors is:
- Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).


@ -32,7 +32,7 @@ Supported chips:
Datasheet: Publicly available at the Texas Instruments website
http://www.ti.com/
Author: Lothar Felten <l-felten@ti.com>
Author: Lothar Felten <lothar.felten@gmail.com>
Description
-----------


@ -19,6 +19,24 @@ Contents:
gpu/index
80211/index
This section describes CPU vulnerabilities and their mitigations.
.. toctree::
:maxdepth: 1
hw-vuln/index
Architecture-specific documentation
-----------------------------------
These books provide programming details about architecture-specific
implementation.
.. toctree::
:maxdepth: 2
x86/index
Indices and tables
==================


@ -301,7 +301,10 @@ them as any other INPUT_PROP_BUTTONPAD device.
INPUT_PROP_ACCELEROMETER
-------------------------
Directional axes on this device (absolute and/or relative x, y, z) represent
accelerometer data. All other axes retain their meaning. A device must not mix
accelerometer data. Some devices also report gyroscope data, which devices
can report through the rotational axes (absolute and/or relative rx, ry, rz).
All other axes retain their meaning. A device must not mix
regular directional axes and accelerometer axes on the same event node.
Guidelines:


@ -314,7 +314,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
This facility can be used to prevent such uncontrolled
GPE floodings.
Format: <int>
Support masking of GPEs numbered from 0x00 to 0x7f.
acpi_no_auto_serialize [HW,ACPI]
Disable auto-serialization of AML methods
@ -1090,12 +1089,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nopku [X86] Disable Memory Protection Keys CPU feature found
in some Intel CPUs.
eagerfpu= [X86]
on enable eager fpu restore
off disable eager fpu restore
auto selects the default scheme, which automatically
enables eagerfpu restore for xsaveopt.
module.async_probe [KNL]
Enable asynchronous probe on this module.
@ -1984,6 +1977,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
kmemcheck=2 (one-shot mode)
Default: 2 (one-shot mode)
kpti= [ARM64] Control page table isolation of user
and kernel address spaces.
Default: enabled on cores which need mitigation.
0: force disabled
1: force enabled
kstack=N [X86] Print N words from the kernel stack
in oops dumps.
@ -1994,6 +1993,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
KVM MMU at runtime.
Default is 0 (off)
kvm.nx_huge_pages=
[KVM] Controls the software workaround for the
X86_BUG_ITLB_MULTIHIT bug.
force : Always deploy workaround.
off : Never deploy workaround.
auto : Deploy workaround based on the presence of
X86_BUG_ITLB_MULTIHIT.
Default is 'auto'.
If the software workaround is enabled for the host,
guests need not enable it for nested guests.
kvm.nx_huge_pages_recovery_ratio=
[KVM] Controls how many 4KiB pages are periodically zapped
back to huge pages. 0 disables the recovery, otherwise if
the value is N KVM will zap 1/Nth of the 4KiB pages every
minute. The default is 60.
kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
Default is 1 (enabled)
@ -2022,10 +2040,87 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
(virtualized real and unpaged mode) on capable
Intel chips. Default is 1 (enabled)
kvm-intel.vmentry_l1d_flush=[KVM,Intel] Mitigation for L1 Terminal Fault
CVE-2018-3620.
Valid arguments: never, cond, always
always: L1D cache flush on every VMENTER.
cond: Flush L1D on VMENTER only when the code between
VMEXIT and VMENTER can leak host memory.
never: Disables the mitigation
Default is cond (do L1 cache flush in specific instances)
kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
feature (tagged TLBs) on capable Intel chips.
Default is 1 (enabled)
l1tf= [X86] Control mitigation of the L1TF vulnerability on
affected CPUs
The kernel PTE inversion protection is unconditionally
enabled and cannot be disabled.
full
Provides all available mitigations for the
L1TF vulnerability. Disables SMT and
enables all mitigations in the
hypervisors, i.e. unconditional L1D flush.
SMT control and L1D flush control via the
sysfs interface is still possible after
boot. Hypervisors will issue a warning
when the first VM is started in a
potentially insecure configuration,
i.e. SMT enabled or L1D flush disabled.
full,force
Same as 'full', but disables SMT and L1D
flush runtime control. Implies the
'nosmt=force' command line option.
(i.e. sysfs control of SMT is disabled.)
flush
Leaves SMT enabled and enables the default
hypervisor mitigation, i.e. conditional
L1D flush.
SMT control and L1D flush control via the
sysfs interface is still possible after
boot. Hypervisors will issue a warning
when the first VM is started in a
potentially insecure configuration,
i.e. SMT enabled or L1D flush disabled.
flush,nosmt
Disables SMT and enables the default
hypervisor mitigation.
SMT control and L1D flush control via the
sysfs interface is still possible after
boot. Hypervisors will issue a warning
when the first VM is started in a
potentially insecure configuration,
i.e. SMT enabled or L1D flush disabled.
flush,nowarn
Same as 'flush', but hypervisors will not
warn when a VM is started in a potentially
insecure configuration.
off
Disables hypervisor mitigations and doesn't
emit any warnings.
It also drops the swap size and available
RAM limit restriction on both hypervisor and
bare metal.
Default is 'flush'.
For details see: Documentation/hw-vuln/l1tf.rst
l2cr= [PPC]
l3cr= [PPC]
@ -2267,6 +2362,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Format: <first>,<last>
Specifies range of consoles to be captured by the MDA.
mds= [X86,INTEL]
Control mitigation for the Micro-architectural Data
Sampling (MDS) vulnerability.
Certain CPUs are vulnerable to an exploit against CPU
internal buffers which can forward information to a
disclosure gadget under certain conditions.
In vulnerable processors, the speculatively
forwarded data can be used in a cache side channel
attack, to access data to which the attacker does
not have direct access.
This parameter controls the MDS mitigation. The
options are:
full - Enable MDS mitigation on vulnerable CPUs
full,nosmt - Enable MDS mitigation and disable
SMT on vulnerable CPUs
off - Unconditionally disable MDS mitigation
On TAA-affected machines, mds=off can be prevented by
an active TAA mitigation, as both vulnerabilities are
mitigated with the same mechanism. To disable this
mitigation you therefore need to specify
tsx_async_abort=off too.
Not specifying this option is equivalent to
mds=full.
For details see: Documentation/hw-vuln/mds.rst
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
Amount of memory to be used when the kernel is not able
to see the whole system memory or for test.
@ -2389,6 +2516,47 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
in the "bleeding edge" mini2440 support kernel at
http://repo.or.cz/w/linux-2.6/mini2440.git
mitigations=
[X86] Control optional mitigations for CPU
vulnerabilities. This is a set of curated,
arch-independent options, each of which is an
aggregation of existing arch-specific options.
off
Disable all optional CPU mitigations. This
improves system performance, but it may also
expose users to several CPU vulnerabilities.
Equivalent to: nopti [X86]
nospectre_v1 [X86]
nospectre_v2 [X86]
spectre_v2_user=off [X86]
spec_store_bypass_disable=off [X86]
l1tf=off [X86]
mds=off [X86]
tsx_async_abort=off [X86]
kvm.nx_huge_pages=off [X86]
Exceptions:
This does not have any effect on
kvm.nx_huge_pages when
kvm.nx_huge_pages=force.
auto (default)
Mitigate all CPU vulnerabilities, but leave SMT
enabled, even if it's vulnerable. This is for
users who don't want to be surprised by SMT
getting disabled across kernel upgrades, or who
have other ways of avoiding SMT-based attacks.
Equivalent to: (default behavior)
auto,nosmt
Mitigate all CPU vulnerabilities, disabling SMT
if needed. This is for users who always want to
be fully mitigated, even if it means losing SMT.
Equivalent to: l1tf=flush,nosmt [X86]
mds=full,nosmt [X86]
tsx_async_abort=full,nosmt [X86]
mminit_loglevel=
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
parameter allows control of the logging verbosity for
@ -2706,7 +2874,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
[KNL,x86] Disable symmetric multithreading (SMT).
nosmt=force: Force disable SMT, cannot be undone
via the sysfs control file.
nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1
(bounds check bypass). With this option data leaks are
possible in the system.
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
(indirect branch prediction) vulnerability. System may
allow data leaks with this option, which is equivalent
to spectre_v2=off.
@ -3328,6 +3504,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
before loading.
See Documentation/blockdev/ramdisk.txt.
psi= [KNL] Enable or disable pressure stall information
tracking.
Format: <bool>
psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to
probe for; one of (bare|imps|exps|lifebook|any).
psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports
@ -3704,6 +3884,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Run specified binary instead of /init from the ramdisk,
used for early userspace startup. See initrd.
rdrand= [X86]
force - Override the decision by the kernel to hide the
advertisement of RDRAND support (this affects
certain AMD processors because of buggy BIOS
support, specifically around the suspend/resume
path).
reboot= [KNL]
Format (x86 or x86_64):
[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
@ -3908,6 +4095,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
last alloc / free. For more information see
Documentation/vm/slub.txt.
slub_memcg_sysfs= [MM, SLUB]
Determines whether to enable sysfs directories for
memory cgroup sub-caches. 1 to enable, 0 to disable.
The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON.
Enabling this can lead to a very high number of debug
directories and files being created under
/sys/kernel/slub.
slub_max_order= [MM, SLUB]
Determines the maximum allowed order for slabs.
A high setting may cause OOMs due to memory
@ -3967,9 +4162,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
spectre_v2= [X86] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability.
The default operation protects the kernel from
user space attacks.
on - unconditionally enable
off - unconditionally disable
on - unconditionally enable, implies
spectre_v2_user=on
off - unconditionally disable, implies
spectre_v2_user=off
auto - kernel detects whether your CPU model is
vulnerable
@ -3979,6 +4178,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
CONFIG_RETPOLINE configuration option, and the
compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
Selecting 'off' will disable both the kernel and
the user space protections.
Specific mitigations can also be selected manually:
retpoline - replace indirect branches
@ -3988,6 +4193,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Not specifying this option is equivalent to
spectre_v2=auto.
spectre_v2_user=
[X86] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability between
user space tasks
on - Unconditionally enable mitigations. Is
enforced by spectre_v2=on
off - Unconditionally disable mitigations. Is
enforced by spectre_v2=off
prctl - Indirect branch speculation is enabled,
but mitigation can be enabled via prctl
per thread. The mitigation control state
is inherited on fork.
prctl,ibpb
- Like "prctl" above, but only STIBP is
controlled per thread. IBPB is issued
always when switching between different user
space processes.
seccomp
- Same as "prctl" above, but all seccomp
threads will enable the mitigation unless
they explicitly opt out.
seccomp,ibpb
- Like "seccomp" above, but only STIBP is
controlled per thread. IBPB is issued
always when switching between different
user space processes.
auto - Kernel selects the mitigation depending on
the available CPU features and vulnerability.
Default mitigation:
If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
Not specifying this option is equivalent to
spectre_v2_user=auto.
spec_store_bypass_disable=
[HW] Control Speculative Store Bypass (SSB) Disable mitigation
(Speculative Store Bypass vulnerability)
@ -4332,6 +4579,76 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
platforms where RDTSC is slow and this accounting
can add overhead.
tsx= [X86] Control Transactional Synchronization
Extensions (TSX) feature in Intel processors that
support TSX control.
This parameter controls the TSX feature. The options are:
on - Enable TSX on the system. Although there are
mitigations for all known security vulnerabilities,
TSX has been known to be an accelerator for
several previous speculation-related CVEs, and
so there may be unknown security risks associated
with leaving it enabled.
off - Disable TSX on the system. (Note that this
option takes effect only on newer CPUs which are
not vulnerable to MDS, i.e., have
MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
the new IA32_TSX_CTRL MSR through a microcode
update. This new MSR allows for the reliable
deactivation of the TSX functionality.)
auto - Disable TSX if X86_BUG_TAA is present,
otherwise enable TSX on the system.
Not specifying this option is equivalent to tsx=off.
See Documentation/hw-vuln/tsx_async_abort.rst
for more details.
tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
Abort (TAA) vulnerability.
Similar to Micro-architectural Data Sampling (MDS),
certain CPUs that support Transactional
Synchronization Extensions (TSX) are vulnerable to an
exploit against CPU internal buffers which can forward
information to a disclosure gadget under certain
conditions.
In vulnerable processors, the speculatively forwarded
data can be used in a cache side channel attack, to
access data to which the attacker does not have direct
access.
This parameter controls the TAA mitigation. The
options are:
full - Enable TAA mitigation on vulnerable CPUs
if TSX is enabled.
full,nosmt - Enable TAA mitigation and disable SMT on
vulnerable CPUs. If TSX is disabled, SMT
is not disabled because CPU is not
vulnerable to cross-thread TAA attacks.
off - Unconditionally disable TAA mitigation
On MDS-affected machines, tsx_async_abort=off can be
prevented by an active MDS mitigation, as both vulnerabilities
are mitigated with the same mechanism. To disable this
mitigation you therefore need to specify mds=off too.
Not specifying this option is equivalent to
tsx_async_abort=full. On CPUs which are MDS affected
and deploy MDS mitigation, TAA mitigation is not
required and doesn't provide any additional
mitigation.
For details see:
Documentation/hw-vuln/tsx_async_abort.rst
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
Format:


@ -122,14 +122,11 @@ min_adv_mss - INTEGER
IP Fragmentation:
ipfrag_high_thresh - INTEGER
Maximum memory used to reassemble IP fragments. When
ipfrag_high_thresh bytes of memory is allocated for this purpose,
the fragment handler will toss packets until ipfrag_low_thresh
is reached. This also serves as a maximum limit to namespaces
different from the initial one.
ipfrag_high_thresh - LONG INTEGER
Maximum memory used to reassemble IP fragments.
ipfrag_low_thresh - INTEGER
ipfrag_low_thresh - LONG INTEGER
(Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources.
The kernel still accepts new fragments for defragmentation.
@ -233,6 +230,14 @@ tcp_base_mss - INTEGER
Path MTU discovery (MTU probing). If MTU probing is enabled,
this is the initial MSS used by the connection.
tcp_min_snd_mss - INTEGER
TCP SYN and SYNACK messages usually advertise an ADVMSS option,
as described in RFC 1122 and RFC 6691.
If this ADVMSS option is smaller than tcp_min_snd_mss,
it is silently capped to tcp_min_snd_mss.
Default : 48 (at least 8 bytes of payload per segment)
tcp_congestion_control - STRING
Set the congestion control algorithm to be used for new
connections. The algorithm "reno" is always available, but
@ -408,6 +413,7 @@ tcp_min_rtt_wlen - INTEGER
minimum RTT when it is moved to a longer path (e.g., due to traffic
engineering). A longer window makes the filter more resistant to RTT
inflations such as transient congestion. The unit is seconds.
Possible values: 0 - 86400 (1 day)
Default: 300
tcp_moderate_rcvbuf - BOOLEAN


@ -31,6 +31,15 @@ return from vsnprintf.
Raw pointer value SHOULD be printed with %p. The kernel supports
the following extended format specifiers for pointer types:
Pointer Types:
Pointers printed without a specifier extension (i.e. unadorned %p) are
hashed to give a unique identifier without leaking kernel addresses to user
space. On 64 bit machines the first 32 bits are zeroed. If you _really_
want the address see %px below.
%p abcdef12 or 00000000abcdef12
Symbols/Function Pointers:
%pF versatile_init+0x0/0x110
@ -58,12 +67,24 @@ Symbols/Function Pointers:
Kernel Pointers:
%pK 0x01234567 or 0x0123456789abcdef
%pK 01234567 or 0123456789abcdef
For printing kernel pointers which should be hidden from unprivileged
users. The behaviour of %pK depends on the kptr_restrict sysctl - see
Documentation/sysctl/kernel.txt for more details.
Unmodified Addresses:
%px 01234567 or 0123456789abcdef
For printing pointers when you _really_ want to print the address. Please
consider whether or not you are leaking sensitive information about the
Kernel layout in memory before printing pointers with %px. %px is
functionally equivalent to %lx. %px is preferred to %lx because it is more
uniquely grep'able. If, in the future, we need to modify the way the Kernel
handles printing pointers it will be nice to be able to find the call
sites.
Struct Resources:
%pr [mem 0x60000000-0x6fffffff flags 0x2200] or

Documentation/siphash.txt Normal file

@ -0,0 +1,175 @@
SipHash - a short input PRF
-----------------------------------------------
Written by Jason A. Donenfeld <jason@zx2c4.com>
SipHash is a cryptographically secure PRF -- a keyed hash function -- that
performs very well for short inputs, hence the name. It was designed by
cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended
as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`,
and so forth.
SipHash takes a secret key filled with randomly generated numbers and either
an input buffer or several input integers. It spits out an integer that is
indistinguishable from random. You may then use that integer as part of secure
sequence numbers, secure cookies, or mask it off for use in a hash table.
1. Generating a key
Keys should always be generated from a cryptographically secure source of
random numbers, either using get_random_bytes or get_random_once:
siphash_key_t key;
get_random_bytes(&key, sizeof(key));
If you're not deriving your key from here, you're doing it wrong.
2. Using the functions
There are two variants of the function, one that takes a list of integers, and
one that takes a buffer:
u64 siphash(const void *data, size_t len, const siphash_key_t *key);
And:
u64 siphash_1u64(u64, const siphash_key_t *key);
u64 siphash_2u64(u64, u64, const siphash_key_t *key);
u64 siphash_3u64(u64, u64, u64, const siphash_key_t *key);
u64 siphash_4u64(u64, u64, u64, u64, const siphash_key_t *key);
u64 siphash_1u32(u32, const siphash_key_t *key);
u64 siphash_2u32(u32, u32, const siphash_key_t *key);
u64 siphash_3u32(u32, u32, u32, const siphash_key_t *key);
u64 siphash_4u32(u32, u32, u32, u32, const siphash_key_t *key);
If you pass the generic siphash function something of a constant length, it
will constant fold at compile-time and automatically choose one of the
optimized functions.
3. Hashtable key function usage:
struct some_hashtable {
DECLARE_HASHTABLE(hashtable, 8);
siphash_key_t key;
};
void init_hashtable(struct some_hashtable *table)
{
get_random_bytes(&table->key, sizeof(table->key));
}
static inline struct hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
{
return &table->hashtable[siphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
}
You may then iterate like usual over the returned hash bucket.
4. Security
SipHash has a very high security margin, with its 128-bit key. So long as the
key is kept secret, it is impossible for an attacker to guess the outputs of
the function, even when able to observe many outputs, since 2^128 outputs
is significant.
Linux implements the "2-4" variant of SipHash.
5. Struct-passing Pitfalls
Oftentimes the XuY functions will not be large enough, and instead you'll
want to pass a pre-filled struct to siphash. When doing this, it's important
to always ensure the struct has no padding holes. The easiest way to do this
is to simply arrange the members of the struct in descending order of size,
and to use offsetofend() instead of sizeof() for getting the size. For
performance reasons, if possible, it's probably a good thing to align the
struct to the right boundary. Here's an example:
const struct {
struct in6_addr saddr;
u32 counter;
u16 dport;
} __aligned(SIPHASH_ALIGNMENT) combined = {
.saddr = *(struct in6_addr *)saddr,
.counter = counter,
.dport = dport
};
u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret);
6. Resources
Read the SipHash paper if you're interested in learning more:
https://131002.net/siphash/siphash.pdf
~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~
HalfSipHash - SipHash's insecure younger cousin
-----------------------------------------------
Written by Jason A. Donenfeld <jason@zx2c4.com>
On the off-chance that SipHash is not fast enough for your needs, you might be
able to justify using HalfSipHash, a terrifying but potentially useful
possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and,
even scarier, uses an easily brute-forceable 64-bit key (with a 32-bit output)
instead of SipHash's 128-bit key. However, this may appeal to some
high-performance `jhash` users.
Danger!
Do not ever use HalfSipHash except as a hashtable key function, and only
then when you can be absolutely certain that the outputs will never be
transmitted out of the kernel. This is only remotely useful over `jhash` as a
means of mitigating hashtable flooding denial of service attacks.
1. Generating a key
Keys should always be generated from a cryptographically secure source of
random numbers, either using get_random_bytes or get_random_once:
hsiphash_key_t key;
get_random_bytes(&key, sizeof(key));
If you're not deriving your key from here, you're doing it wrong.
2. Using the functions
There are two variants of the function, one that takes a list of integers, and
one that takes a buffer:
u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key);
And:
u32 hsiphash_1u32(u32, const hsiphash_key_t *key);
u32 hsiphash_2u32(u32, u32, const hsiphash_key_t *key);
u32 hsiphash_3u32(u32, u32, u32, const hsiphash_key_t *key);
u32 hsiphash_4u32(u32, u32, u32, u32, const hsiphash_key_t *key);
If you pass the generic hsiphash function something of a constant length, it
will constant fold at compile-time and automatically choose one of the
optimized functions.
3. Hashtable key function usage:
struct some_hashtable {
DECLARE_HASHTABLE(hashtable, 8);
hsiphash_key_t key;
};
void init_hashtable(struct some_hashtable *table)
{
get_random_bytes(&table->key, sizeof(table->key));
}
static inline struct hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
{
return &table->hashtable[hsiphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
}
You may then iterate like usual over the returned hash bucket.
4. Performance
HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements,
this will not be a problem, as the hashtable lookup isn't the bottleneck. And
in general, this is probably a good sacrifice to make for the security and DoS
resistance of HalfSipHash.


@ -92,3 +92,12 @@ Speculation misfeature controls
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
- PR_SPEC_INDIRECT_BRANCH: Indirect Branch Speculation in User Processes
(Mitigate Spectre V2 style attacks against user processes)
Invocations:
* prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);
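A sketch of how an application might combine these calls (error handling is
minimal, the constants are assumed to come from reasonably recent
<sys/prctl.h>/<linux/prctl.h> headers, and the PR_SET call only succeeds when
the kernel runs in a prctl-capable mitigation mode)::

  #include <stdio.h>
  #include <sys/prctl.h>

  int main(void)
  {
      int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);

      if (state < 0) {
          perror("PR_GET_SPECULATION_CTRL");
          return 1;
      }
      printf("indirect branch speculation state: 0x%x\n", (unsigned int)state);

      /* Opt this task out of indirect branch speculation. */
      if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                PR_SPEC_DISABLE, 0, 0) < 0) {
          perror("PR_SET_SPECULATION_CTRL");
          return 1;
      }
      return 0;
  }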


@ -34,7 +34,9 @@ Currently, these files are in /proc/sys/fs:
- overflowgid
- pipe-user-pages-hard
- pipe-user-pages-soft
- protected_fifos
- protected_hardlinks
- protected_regular
- protected_symlinks
- suid_dumpable
- super-max
@ -182,6 +184,24 @@ applied.
==============================================================
protected_fifos:
The intent of this protection is to avoid unintentional writes to
an attacker-controlled FIFO, where a program expected to create a regular
file.
When set to "0", writing to FIFOs is unrestricted.
When set to "1" don't allow O_CREAT open on FIFOs that we don't own
in world writable sticky directories, unless they are owned by the
owner of the directory.
When set to "2" it also applies to group writable sticky directories.
This protection is based on the restrictions in Openwall.
==============================================================
protected_hardlinks:
A long-standing class of security issues is the hardlink-based
@ -202,6 +222,22 @@ This protection is based on the restrictions in Openwall and grsecurity.
==============================================================
protected_regular:
This protection is similar to protected_fifos, but it
avoids writes to an attacker-controlled regular file, where a program
expected to create one.
When set to "0", writing to regular files is unrestricted.
When set to "1" don't allow O_CREAT open on regular files that we
don't own in world writable sticky directories, unless they are
owned by the owner of the directory.
When set to "2" it also applies to group writable sticky directories.
==============================================================
protected_symlinks:
A long-standing class of security issues is the symlink-based


@ -54,6 +54,14 @@ Values :
1 - enable JIT hardening for unprivileged users only
2 - enable JIT hardening for all users
bpf_jit_limit
-------------
This enforces a global limit for memory allocations to the BPF JIT
compiler in order to reject unprivileged JIT requests once it has
been surpassed. bpf_jit_limit contains the value of the global limit
in bytes.
dev_weight
--------------


@ -365,11 +365,15 @@ autosuspend the interface's device. When the usage counter is = 0
then the interface is considered to be idle, and the kernel may
autosuspend the device.
Drivers need not be concerned about balancing changes to the usage
counter; the USB core will undo any remaining "get"s when a driver
is unbound from its interface. As a corollary, drivers must not call
any of the usb_autopm_* functions after their disconnect() routine has
returned.
Drivers must be careful to balance their overall changes to the usage
counter. Unbalanced "get"s will remain in effect when a driver is
unbound from its interface, preventing the device from going into
runtime suspend should the interface be bound to a driver again. On
the other hand, drivers are allowed to achieve this balance by calling
the ``usb_autopm_*`` functions even after their ``disconnect`` routine
has returned -- say from within a work-queue routine -- provided they
retain an active reference to the interface (via ``usb_get_intf`` and
``usb_put_intf``).
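A sketch of the pattern described above, using hypothetical demo_* names
(only the reference and usage-counter handling is the point; a real driver
would embed this in its own probe/open/disconnect logic)::

  #include <linux/usb.h>
  #include <linux/workqueue.h>

  struct demo_dev {                       /* hypothetical driver state */
      struct usb_interface *intf;
      struct work_struct release_work;
  };

  /* Take a PM usage reference plus an interface reference before doing I/O. */
  static int demo_start(struct demo_dev *dev)
  {
      int ret = usb_autopm_get_interface(dev->intf);

      if (ret)
          return ret;
      usb_get_intf(dev->intf);    /* keeps the interface valid for the work item */
      return 0;
  }

  /*
   * Queued by the driver when it is finished with the interface.  It may
   * legitimately run after disconnect() has returned, because the driver
   * still holds the reference taken with usb_get_intf() above.
   */
  static void demo_release_work(struct work_struct *work)
  {
      struct demo_dev *dev = container_of(work, struct demo_dev, release_work);

      usb_autopm_put_interface(dev->intf);    /* balance the earlier "get" */
      usb_put_intf(dev->intf);
  }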
Drivers using the async routines are responsible for their own
synchronization and mutual exclusion.


@ -1,138 +0,0 @@
Copyright (C) 1999, 2000 Bruce Tenison
Portions Copyright (C) 1999, 2000 David Nelson
Thanks to David Nelson for guidance and the usage of the scanner.txt
and scanner.c files to model our driver and this informative file.
Mar. 2, 2000
CHANGES
- Initial Revision
OVERVIEW
This README will address issues regarding how to configure the kernel
to access a RIO 500 mp3 player.
Before I explain how to use this to access the Rio500 please be warned:
W A R N I N G:
--------------
Please note that this software is still under development. The authors
are in no way responsible for any damage that may occur, no matter how
inconsequential.
It seems that the Rio has a problem when sending .mp3 with low batteries.
I suggest when the batteries are low and you want to transfer stuff that you
replace it with a fresh one. In my case, what happened is I lost two 16kb
blocks (they are no longer usable to store information to it). But I don't
know if that's normal or not; it could simply be a problem with the flash
memory.
In an extreme case, I left my Rio playing overnight and the batteries wore
down to nothing and appear to have corrupted the flash memory. My RIO
needed to be replaced as a result. Diamond tech support is aware of the
problem. Do NOT allow your batteries to wear down to nothing before
changing them. It appears RIO 500 firmware does not handle low battery
power well at all.
On systems with OHCI controllers, the kernel OHCI code appears to have
power on problems with some chipsets. If you are having problems
connecting to your RIO 500, try turning it on first and then plugging it
into the USB cable.
Contact information:
--------------------
The main page for the project is hosted at sourceforge.net in the following
URL: <http://rio500.sourceforge.net>. You can also go to the project's
sourceforge home page at: <http://sourceforge.net/projects/rio500/>.
There is also a mailing list: rio500-users@lists.sourceforge.net
Authors:
-------
Most of the code was written by Cesar Miquel <miquel@df.uba.ar>. Keith
Clayton <kclayton@jps.net> is in charge of the PPC port and making sure
things work there. Bruce Tenison <btenison@dibbs.net> is adding support
for .fon files and also does testing. The program will most likely be
re-written and Pete Ikusz along with the rest will re-design it. I would
also like to thank Tri Nguyen <tmn_3022000@hotmail.com> who provided us
with some important information regarding the communication with the Rio.
ADDITIONAL INFORMATION and Userspace tools
http://rio500.sourceforge.net/
REQUIREMENTS
A host with a USB port. Ideally, either a UHCI (Intel) or OHCI
(Compaq and others) hardware port should work.
A Linux development kernel (2.3.x) with USB support enabled or a
backported version to linux-2.2.x. See http://www.linux-usb.org for
more information on accomplishing this.
A Linux kernel with RIO 500 support enabled.
'lspci' which is only needed to determine the type of USB hardware
available in your machine.
CONFIGURATION
Using `lspci -v`, determine the type of USB hardware available.
If you see something like:
USB Controller: ......
Flags: .....
I/O ports at ....
Then you have a UHCI based controller.
If you see something like:
USB Controller: .....
Flags: ....
Memory at .....
Then you have a OHCI based controller.
Using `make menuconfig` or your preferred method for configuring the
kernel, select 'Support for USB', 'OHCI/UHCI' depending on your
hardware (determined from the steps above), 'USB Diamond Rio500 support', and
'Preliminary USB device filesystem'. Compile and install the modules
(you may need to execute `depmod -a` to update the module
dependencies).
Add a device for the USB rio500:
`mknod /dev/usb/rio500 c 180 64`
Set appropriate permissions for /dev/usb/rio500 (don't forget about
group and world permissions). Both read and write permissions are
required for proper operation.
Load the appropriate modules (if compiled as modules):
OHCI:
modprobe usbcore
modprobe usb-ohci
modprobe rio500
UHCI:
modprobe usbcore
modprobe usb-uhci (or uhci)
modprobe rio500
That's it. The Rio500 Utils at: http://rio500.sourceforge.net should
be able to access the rio500.
BUGS
If you encounter any problems feel free to drop me an email.
Bruce Tenison
btenison@dibbs.net


@ -13,7 +13,7 @@ of a virtual machine. The ioctls belong to three classes
- VM ioctls: These query and set attributes that affect an entire virtual
machine, for example memory layout. In addition a VM ioctl is used to
create virtual cpus (vcpus).
create virtual cpus (vcpus) and devices.
Only run VM ioctls from the same process (address space) that was used
to create the VM.
@ -24,6 +24,11 @@ of a virtual machine. The ioctls belong to three classes
Only run vcpu ioctls from the same thread that was used to create the
vcpu.
- device ioctls: These query and set attributes that control the operation
of a single device.
device ioctls must be issued from the same process (address space) that
was used to create the VM.
2. File descriptors
-------------------
@ -32,10 +37,11 @@ The kvm API is centered around file descriptors. An initial
open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this
handle will create a VM file descriptor which can be used to issue VM
ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
and return a file descriptor pointing to it. Finally, ioctls on a vcpu
fd can be used to control the vcpu, including the important task of
actually running guest code.
ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will
create a virtual cpu or device and return a file descriptor pointing to
the new resource. Finally, ioctls on a vcpu or device fd can be used
to control the vcpu or device. For vcpus, this includes the important
task of actually running guest code.
In general file descriptors can be migrated among processes by means
of fork() and the SCM_RIGHTS facility of unix domain socket. These
@ -122,14 +128,15 @@ KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
privileged user (CAP_SYS_ADMIN).
4.3 KVM_GET_MSR_INDEX_LIST
4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST
Capability: basic
Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST
Architectures: x86
Type: system
Type: system ioctl
Parameters: struct kvm_msr_list (in/out)
Returns: 0 on success; -1 on error
Errors:
EFAULT: the msr index list cannot be read from or written to
E2BIG: the msr index list is too big to fit in the array specified by
the user.
@ -138,16 +145,23 @@ struct kvm_msr_list {
__u32 indices[0];
};
This ioctl returns the guest msrs that are supported. The list varies
by kvm version and host processor, but does not change otherwise. The
user fills in the size of the indices array in nmsrs, and in return
kvm adjusts nmsrs to reflect the actual number of msrs and fills in
the indices array with their numbers.
The user fills in the size of the indices array in nmsrs, and in return
kvm adjusts nmsrs to reflect the actual number of msrs and fills in the
indices array with their numbers.
KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list
varies by kvm version and host processor, but does not change otherwise.
Note: if kvm indicates support for MCE (KVM_CAP_MCE), then the MCE bank MSRs are
not returned in the MSR list, as different vcpus can have a different number
of banks, as set via the KVM_X86_SETUP_MCE ioctl.
KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed
to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities
and processor features that are exposed via MSRs (e.g., VMX capabilities).
This list also varies by kvm version and host processor, but does not change
otherwise.
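For illustration, user space could query the supported guest MSR indices
roughly like this (a sketch only; real code would retry with the nmsrs value
reported back when the ioctl fails with E2BIG)::

  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  int main(void)
  {
      unsigned int n = 256;               /* arbitrary first guess */
      struct kvm_msr_list *list;
      int kvm = open("/dev/kvm", O_RDWR);

      if (kvm < 0) {
          perror("/dev/kvm");
          return 1;
      }
      list = calloc(1, sizeof(*list) + n * sizeof(list->indices[0]));
      if (!list)
          return 1;
      list->nmsrs = n;
      if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) == 0)
          printf("kvm supports %u guest MSRs\n", list->nmsrs);
      else
          perror("KVM_GET_MSR_INDEX_LIST"); /* E2BIG: nmsrs now holds the needed size */
      free(list);
      return 0;
  }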
4.4 KVM_CHECK_EXTENSION
@ -474,14 +488,22 @@ Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead.
4.18 KVM_GET_MSRS
Capability: basic
Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system)
Architectures: x86
Type: vcpu ioctl
Type: system ioctl, vcpu ioctl
Parameters: struct kvm_msrs (in/out)
Returns: 0 on success, -1 on error
Returns: number of msrs successfully returned;
-1 on error
When used as a system ioctl:
Reads the values of MSR-based features that are available for the VM. This
is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values.
The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST
in a system ioctl.
When used as a vcpu ioctl:
Reads model-specific registers from the vcpu. Supported msr indices can
be obtained using KVM_GET_MSR_INDEX_LIST.
be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl.
struct kvm_msrs {
__u32 nmsrs; /* number of msrs in entries */


@ -13,8 +13,8 @@ The acquisition orders for mutexes are as follows:
- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
them together is quite rare.
For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything
else is a leaf: no other lock is taken inside the critical sections.
Everything else is a leaf: no other lock is taken inside the critical
sections.
2: Exception
------------
@ -142,7 +142,7 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
------------
Name: kvm_lock
Type: spinlock_t
Type: mutex
Arch: any
Protects: - vm_list

Documentation/x86/conf.py Normal file

@ -0,0 +1,10 @@
# -*- coding: utf-8; mode: python -*-
project = "X86 architecture specific documentation"
tags.add("subproject")
latex_documents = [
('index', 'x86.tex', project,
'The kernel development community', 'manual'),
]


@ -0,0 +1,9 @@
==========================
x86 architecture specifics
==========================
.. toctree::
:maxdepth: 1
mds
tsx_async_abort

Documentation/x86/mds.rst Normal file

@ -0,0 +1,193 @@
Microarchitectural Data Sampling (MDS) mitigation
=================================================
.. _mds:
Overview
--------
Microarchitectural Data Sampling (MDS) is a family of side channel attacks
on internal buffers in Intel CPUs. The variants are:
- Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126)
- Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130)
- Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127)
- Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091)
MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a
dependent load (store-to-load forwarding) as an optimization. The forward
can also happen to a faulting or assisting load operation for a different
memory address, which can be exploited under certain conditions. Store
buffers are partitioned between Hyper-Threads so cross thread forwarding is
not possible. But if a thread enters or exits a sleep state the store
buffer is repartitioned which can expose data from one thread to the other.
MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage
L1 miss situations and to hold data which is returned or sent in response
to a memory or I/O operation. Fill buffers can forward data to a load
operation and also write data to the cache. When the fill buffer is
deallocated it can retain the stale data of the preceding operations which
can then be forwarded to a faulting or assisting load operation, which can
be exploited under certain conditions. Fill buffers are shared between
Hyper-Threads so cross thread leakage is possible.
MLPDS leaks Load Port Data. Load ports are used to perform load operations
from memory or I/O. The received data is then forwarded to the register
file or a subsequent operation. In some implementations the Load Port can
contain stale data from a previous operation which can be forwarded to
faulting or assisting loads under certain conditions, which again can be
exploited eventually. Load ports are shared between Hyper-Threads so cross
thread leakage is possible.
MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from
memory that takes a fault or assist can leave data in a microarchitectural
structure that may later be observed using one of the same methods used by
MSBDS, MFBDS or MLPDS.
Exposure assumptions
--------------------
It is assumed that attack code resides in user space or in a guest with one
exception. The rationale behind this assumption is that the code construct
needed for exploiting MDS requires:
- to control the load to trigger a fault or assist
- to have a disclosure gadget which exposes the speculatively accessed
data for consumption through a side channel.
- to control the pointer through which the disclosure gadget exposes the
data
The existence of such a construct in the kernel cannot be excluded with
100% certainty, but the complexity involved makes it extremely unlikely.
There is one exception, which is untrusted BPF. The functionality of
untrusted BPF is limited, but it needs to be thoroughly investigated
whether it can be used to create such a construct.
Mitigation strategy
-------------------
All variants have the same mitigation strategy at least for the single CPU
thread case (SMT off): Force the CPU to clear the affected buffers.
This is achieved by using the otherwise unused and obsolete VERW
instruction in combination with a microcode update. The microcode clears
the affected CPU buffers when the VERW instruction is executed.
For virtualization there are two ways to achieve CPU buffer
clearing. Either the modified VERW instruction or via the L1D Flush
command. The latter is issued when L1TF mitigation is enabled so the extra
VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to
be issued.
If the VERW instruction with the supplied segment selector argument is
executed on a CPU without the microcode update there is no side effect
other than a small number of pointlessly wasted CPU cycles.
This does not protect against cross Hyper-Thread attacks except for MSBDS
which is only exploitable cross Hyper-thread when one of the Hyper-Threads
enters a C-state.
The kernel provides a function to invoke the buffer clearing:
mds_clear_cpu_buffers()
The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
(idle) transitions.
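For illustration, a hedged sketch of what this helper looks like on x86: VERW only needs a memory operand referencing a valid data segment selector, and the microcode side effect performs the actual clearing::

  static inline void mds_clear_cpu_buffers(void)
  {
          static const u16 ds = __KERNEL_DS;

          /* VERW modifies ZF, hence the "cc" clobber. */
          asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
  }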
As a special quirk to address virtualization scenarios where the host has
the microcode updated, but the hypervisor does not (yet) expose the
MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the
hope that it might actually clear the buffers. The state is reflected
accordingly.
According to current knowledge additional mitigations inside the kernel
itself are not required because the necessary gadgets to expose the leaked
data cannot be controlled in a way which allows exploitation from malicious
user space or VM guests.
Kernel internal mitigation modes
--------------------------------
======= ============================================================
off     Mitigation is disabled. Either the CPU is not affected or
        mds=off is supplied on the kernel command line
full    Mitigation is enabled. CPU is affected and MD_CLEAR is
        advertised in CPUID.
vmwerv  Mitigation is enabled. CPU is affected and MD_CLEAR is not
        advertised in CPUID. That is mainly for virtualization
        scenarios where the host has the updated microcode but the
        hypervisor does not expose MD_CLEAR in CPUID. It's a best
        effort approach without guarantee.
======= ============================================================
If the CPU is affected and mds=off is not supplied on the kernel command
line then the kernel selects the appropriate mitigation mode depending on
the availability of the MD_CLEAR CPUID bit.
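A hedged sketch of that selection (the cmdline flag name is illustrative; the feature and bug flags follow the usual x86 cpufeature naming)::

  static void mds_select_mitigation(void)
  {
          /* mds_cmdline_off stands for "mds=off" on the command line. */
          if (!boot_cpu_has_bug(X86_BUG_MDS) || mds_cmdline_off) {
                  mds_mitigation = MDS_MITIGATION_OFF;
                  return;
          }

          if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
                  mds_mitigation = MDS_MITIGATION_FULL;
          else
                  mds_mitigation = MDS_MITIGATION_VMWERV; /* best effort */
  }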
Mitigation points
-----------------
1. Return to user space
^^^^^^^^^^^^^^^^^^^^^^^
When transitioning from kernel to user space the CPU buffers are flushed
on affected CPUs when the mitigation is not disabled on the kernel
command line. The mitigation is enabled through the static key
mds_user_clear.
The mitigation is invoked in prepare_exit_to_usermode() which covers
all but one of the kernel to user space transitions. The exception
is when we return from a Non Maskable Interrupt (NMI), which is
handled directly in do_nmi().
(The reason that NMI is special is that prepare_exit_to_usermode() can
enable IRQs. In NMI context, NMIs are blocked, and we don't want to
enable IRQs with NMIs blocked.)
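A hedged sketch of the check on that path; the static key keeps the cost to a single patched branch when the mitigation is disabled::

  static inline void mds_user_clear_cpu_buffers(void)
  {
          if (static_branch_likely(&mds_user_clear))
                  mds_clear_cpu_buffers();
  }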
2. C-State transition
^^^^^^^^^^^^^^^^^^^^^
When a CPU goes idle and enters a C-State the CPU buffers need to be
cleared on affected CPUs when SMT is active. This addresses the
repartitioning of the store buffer when one of the Hyper-Threads enters
a C-State.
When SMT is inactive, i.e. either the CPU does not support it or all
sibling threads are offline, CPU buffer clearing is not required.
The idle clearing is enabled on CPUs which are only affected by MSBDS
and not by any other MDS variant. The other MDS variants cannot be
protected against cross Hyper-Thread attacks because the Fill Buffer and
the Load Ports are shared. So on CPUs affected by other variants, the
idle clearing would be a window dressing exercise and is therefore not
activated.
The invocation is controlled by the static key mds_idle_clear which is
switched depending on the chosen mitigation mode and the SMT state of
the system.
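A hedged, simplified sketch of how the idle entry path hooks this in before executing MWAIT (feature checks and polling details omitted)::

  static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
  {
          /* Clear the CPU buffers before going idle, as described above. */
          if (static_branch_likely(&mds_idle_clear))
                  mds_clear_cpu_buffers();

          __monitor((void *)&current_thread_info()->flags, 0, 0);
          if (!need_resched())
                  __mwait(eax, ecx);
  }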
The buffer clear is only invoked before entering the C-State, to prevent
stale data from the idling CPU from spilling to the Hyper-Thread
sibling after the store buffer is repartitioned and all entries become
available to the non-idle sibling.
When coming out of idle the store buffer is partitioned again so each
sibling has half of it available. The CPU coming back from idle could
then be speculatively exposed to the contents of the sibling. The buffers are
flushed either on exit to user space or on VMENTER so malicious code
in user space or the guest cannot speculatively access them.
The mitigation is hooked into all variants of halt()/mwait(), but does
not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver
has been superseded by the intel_idle driver around 2010 and is
preferred on all affected CPUs which are expected to gain the MD_CLEAR
functionality in microcode. Aside from that, the IO-Port mechanism is a
legacy interface which is only used on older systems which are either
not affected or do not receive microcode updates anymore.


@ -0,0 +1,117 @@
.. SPDX-License-Identifier: GPL-2.0
TSX Async Abort (TAA) mitigation
================================
.. _tsx_async_abort:
Overview
--------
TSX Async Abort (TAA) is a side channel attack on internal buffers in some
Intel processors, similar to Microarchitectural Data Sampling (MDS). In this
case certain loads may speculatively pass invalid data to dependent operations
when an asynchronous abort condition is pending in a Transactional
Synchronization Extensions (TSX) transaction. This includes loads with no
fault or assist condition. Such loads may speculatively expose stale data from
the same uarch data structures as in MDS, with the same scope of exposure, i.e.
same-thread and cross-thread. This issue affects all current processors that
support TSX.
Mitigation strategy
-------------------
a) TSX disable - one of the mitigations is to disable TSX. A new MSR
IA32_TSX_CTRL will be available in future and current processors after
a microcode update, and it can be used to disable TSX. In addition, it
controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
vulnerability. More details on this approach can be found in
:ref:`Documentation/hw-vuln/mds.rst <mds>`.
Kernel internal mitigation modes
--------------------------------
============= ============================================================
off           Mitigation is disabled. Either the CPU is not affected or
              tsx_async_abort=off is supplied on the kernel command line.
tsx disabled  Mitigation is enabled. TSX feature is disabled by default at
              bootup on processors that support TSX control.
verw          Mitigation is enabled. CPU is affected and MD_CLEAR is
              advertised in CPUID.
ucode needed  Mitigation is enabled. CPU is affected and MD_CLEAR is not
              advertised in CPUID. That is mainly for virtualization
              scenarios where the host has the updated microcode but the
              hypervisor does not expose MD_CLEAR in CPUID. It's a best
              effort approach without guarantee.
============= ============================================================
If the CPU is affected and the "tsx_async_abort" kernel command line parameter is
not provided then the kernel selects an appropriate mitigation depending on the
status of RTM and MD_CLEAR CPUID bits.
The tables below indicate the impact of the tsx=on|off|auto cmdline options on the
state of the TAA mitigation, VERW behavior and the TSX feature for various
combinations of MSR_IA32_ARCH_CAPABILITIES bits.
1. "tsx=off"
========= ========= ============ ============ ============== =================== ======================
MSR_IA32_ARCH_CAPABILITIES bits    Result with cmdline tsx=off
---------------------------------- -------------------------------------------------------------------------
TAA_NO    MDS_NO    TSX_CTRL_MSR TSX state    VERW can clear TAA mitigation      TAA mitigation
                                 after bootup CPU buffers    tsx_async_abort=off tsx_async_abort=full
========= ========= ============ ============ ============== =================== ======================
0         0         0            HW default   Yes            Same as MDS         Same as MDS
0         0         1            Invalid case Invalid case   Invalid case        Invalid case
0         1         0            HW default   No             Need ucode update   Need ucode update
0         1         1            Disabled     Yes            TSX disabled        TSX disabled
1         X         1            Disabled     X              None needed         None needed
========= ========= ============ ============ ============== =================== ======================
2. "tsx=on"
========= ========= ============ ============ ============== =================== ======================
MSR_IA32_ARCH_CAPABILITIES bits    Result with cmdline tsx=on
---------------------------------- -------------------------------------------------------------------------
TAA_NO    MDS_NO    TSX_CTRL_MSR TSX state    VERW can clear TAA mitigation      TAA mitigation
                                 after bootup CPU buffers    tsx_async_abort=off tsx_async_abort=full
========= ========= ============ ============ ============== =================== ======================
0         0         0            HW default   Yes            Same as MDS         Same as MDS
0         0         1            Invalid case Invalid case   Invalid case        Invalid case
0         1         0            HW default   No             Need ucode update   Need ucode update
0         1         1            Enabled      Yes            None                Same as MDS
1         X         1            Enabled      X              None needed         None needed
========= ========= ============ ============ ============== =================== ======================
3. "tsx=auto"
========= ========= ============ ============ ============== =================== ======================
MSR_IA32_ARCH_CAPABILITIES bits    Result with cmdline tsx=auto
---------------------------------- -------------------------------------------------------------------------
TAA_NO    MDS_NO    TSX_CTRL_MSR TSX state    VERW can clear TAA mitigation      TAA mitigation
                                 after bootup CPU buffers    tsx_async_abort=off tsx_async_abort=full
========= ========= ============ ============ ============== =================== ======================
0         0         0            HW default   Yes            Same as MDS         Same as MDS
0         0         1            Invalid case Invalid case   Invalid case        Invalid case
0         1         0            HW default   No             Need ucode update   Need ucode update
0         1         1            Disabled     Yes            TSX disabled        TSX disabled
1         X         1            Enabled      X              None needed         None needed
========= ========= ============ ============ ============== =================== ======================
In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
indicates whether MSR_IA32_TSX_CTRL is supported.
There are two control bits in IA32_TSX_CTRL MSR:
Bit 0: When set it disables the Restricted Transactional Memory (RTM)
sub-feature of TSX (will force all transactions to abort on the
XBEGIN instruction).
Bit 1: When set it disables the enumeration of the RTM and HLE features
(i.e. it will make CPUID(EAX=7).EBX{bit4} and
CPUID(EAX=7).EBX{bit11} read as 0).
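A hedged sketch of using those two bits to turn TSX off; the MSR index and bit macros mirror the description above and should be treated as illustrative::

  #define MSR_IA32_TSX_CTRL       0x00000122
  #define TSX_CTRL_RTM_DISABLE    (1ULL << 0)     /* force XBEGIN to abort */
  #define TSX_CTRL_CPUID_CLEAR    (1ULL << 1)     /* hide RTM/HLE in CPUID */

  static void tsx_disable(void)
  {
          u64 tsx;

          rdmsrl(MSR_IA32_TSX_CTRL, tsx);
          tsx |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
          wrmsrl(MSR_IA32_TSX_CTRL, tsx);
  }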


@ -11081,6 +11081,13 @@ F: arch/arm/mach-s3c24xx/mach-bast.c
F: arch/arm/mach-s3c24xx/bast-ide.c
F: arch/arm/mach-s3c24xx/bast-irq.c
SIPHASH PRF ROUTINES
M: Jason A. Donenfeld <Jason@zx2c4.com>
S: Maintained
F: lib/siphash.c
F: lib/test_siphash.c
F: include/linux/siphash.h
TI DAVINCI MACHINE SUPPORT
M: Sekhar Nori <nsekhar@ti.com>
M: Kevin Hilman <khilman@kernel.org>
@ -11482,6 +11489,7 @@ F: arch/alpha/kernel/srm_env.c
STABLE BRANCH
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
M: Sasha Levin <sashal@kernel.org>
L: stable@vger.kernel.org
S: Supported
F: Documentation/stable_kernel_rules.txt
@ -12478,13 +12486,6 @@ W: http://www.linux-usb.org/usbnet
S: Maintained
F: drivers/net/usb/dm9601.c
USB DIAMOND RIO500 DRIVER
M: Cesar Miquel <miquel@df.uba.ar>
L: rio500-users@lists.sourceforge.net
W: http://rio500.sourceforge.net
S: Maintained
F: drivers/usb/misc/rio500*
USB EHCI DRIVER
M: Alan Stern <stern@rowland.harvard.edu>
L: linux-usb@vger.kernel.org


@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 9
SUBLEVEL = 118
SUBLEVEL = 212
EXTRAVERSION =
NAME = Roaring Lionus
@ -309,11 +309,6 @@ HOSTCXX = g++
HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89
HOSTCXXFLAGS = -O2
ifeq ($(shell $(HOSTCC) -v 2>&1 | grep -c "clang version"), 1)
HOSTCFLAGS += -Wno-unused-value -Wno-unused-parameter \
-Wno-missing-field-initializers -fno-delete-null-pointer-checks
endif
# Decide whether to build built-in, modular, or both.
# Normally, just do built-in.
@ -398,7 +393,7 @@ LINUXINCLUDE += $(filter-out $(LINUXINCLUDE),$(USERINCLUDE))
KBUILD_AFLAGS := -D__ASSEMBLY__
KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-fno-strict-aliasing -fno-common \
-fno-strict-aliasing -fno-common -fshort-wchar \
-Werror-implicit-function-declaration \
-Wno-format-security \
-Werror \
@ -410,6 +405,7 @@ KBUILD_AFLAGS_MODULE := -DMODULE
KBUILD_CFLAGS_MODULE := -DMODULE
KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
GCC_PLUGINS_CFLAGS :=
CLANG_FLAGS :=
# Read KERNELRELEASE from include/config/kernel.release (if it exists)
KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
@ -422,7 +418,8 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_KASAN CFLAGS_UBSAN
export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE
export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE CFLAGS_UBSAN
export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@ -533,6 +530,27 @@ ifneq ($(filter install,$(MAKECMDGOALS)),)
endif
endif
ifeq ($(cc-name),clang)
ifneq ($(CROSS_COMPILE),)
CLANG_TRIPLE ?= $(CROSS_COMPILE)
CLANG_FLAGS += --target=$(notdir $(CLANG_TRIPLE:%-=%))
ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_FLAGS)), y)
$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
endif
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
endif
ifneq ($(GCC_TOOLCHAIN),)
CLANG_FLAGS += --gcc-toolchain=$(GCC_TOOLCHAIN)
endif
CLANG_FLAGS += -no-integrated-as
CLANG_FLAGS += -Werror=unknown-warning-option
KBUILD_CFLAGS += $(CLANG_FLAGS)
KBUILD_AFLAGS += $(CLANG_FLAGS)
endif
ifeq ($(mixed-targets),1)
# ===========================================================================
# We're called with mixed targets (*config and build targets).
@ -674,6 +692,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,)
KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation)
KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow)
KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context)
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, attribute-alias)
ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
@ -729,8 +748,7 @@ export DISABLE_CFI
endif
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
KBUILD_CFLAGS += $(call cc-option,-Oz,-Os)
KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,)
KBUILD_CFLAGS += -Os $(call cc-disable-warning,maybe-uninitialized,)
else
ifdef CONFIG_PROFILE_ALL_BRANCHES
KBUILD_CFLAGS += -O2 $(call cc-disable-warning,maybe-uninitialized,)
@ -790,21 +808,9 @@ endif
KBUILD_CFLAGS += $(stackp-flag)
ifeq ($(cc-name),clang)
ifneq ($(CROSS_COMPILE),)
CLANG_TRIPLE ?= $(CROSS_COMPILE)
CLANG_TARGET := --target=$(notdir $(CLANG_TRIPLE:%-=%))
GCC_TOOLCHAIN := $(realpath $(dir $(shell which $(LD)))/..)
endif
ifneq ($(GCC_TOOLCHAIN),)
CLANG_GCC_TC := --gcc-toolchain=$(GCC_TOOLCHAIN)
endif
KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, duplicate-decl-specifier)
# Quiet clang warning: comparison of unsigned expression < 0 is always false
KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
@ -813,16 +819,14 @@ KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
# See modpost pattern 2
KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
else
# These warnings generated too much noise in a regular build.
# Use make W=1 to enable them (see scripts/Makefile.build)
# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
endif
KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
ifdef CONFIG_FRAME_POINTER
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
else
@ -894,6 +898,9 @@ KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
# disable pointer signed / unsigned warnings in gcc 4.0
KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
# disable stringop warnings in gcc 8+
KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation)
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
@ -924,6 +931,18 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=date-time)
# enforce correct pointer usage
KBUILD_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types)
# Require designated initializers for all marked structures
KBUILD_CFLAGS += $(call cc-option,-Werror=designated-init)
# change __FILE__ to the relative path from the srctree
KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
# ensure -fcf-protection is disabled when using retpoline as it is
# incompatible with -mindirect-branch=thunk-extern
ifdef CONFIG_RETPOLINE
KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
endif
# use the deterministic mode of AR if available
KBUILD_ARFLAGS := $(call ar-option,D)
@ -1628,9 +1647,6 @@ else # KBUILD_EXTMOD
# We are always building modules
KBUILD_MODULES := 1
PHONY += crmodverdir
crmodverdir:
$(cmd_crmodverdir)
PHONY += $(objtree)/Module.symvers
$(objtree)/Module.symvers:
@ -1642,7 +1658,7 @@ $(objtree)/Module.symvers:
module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD))
PHONY += $(module-dirs) modules
$(module-dirs): crmodverdir $(objtree)/Module.symvers
$(module-dirs): prepare $(objtree)/Module.symvers
$(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@)
modules: $(module-dirs)
@ -1683,7 +1699,8 @@ help:
# Dummies...
PHONY += prepare scripts
prepare: ;
prepare:
$(cmd_crmodverdir)
scripts: ;
endif # KBUILD_EXTMOD
@ -1809,17 +1826,14 @@ endif
# Modules
/: prepare scripts FORCE
$(cmd_crmodverdir)
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
$(build)=$(build-dir)
# Make sure the latest headers are built for Documentation
Documentation/ samples/: headers_install
%/: prepare scripts FORCE
$(cmd_crmodverdir)
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
$(build)=$(build-dir)
%.ko: prepare scripts FORCE
$(cmd_crmodverdir)
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
$(build)=$(build-dir) $(@:.ko=.o)
$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost


@ -5,6 +5,9 @@
config KEXEC_CORE
bool
config HOTPLUG_SMT
bool
config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING
@ -513,6 +516,7 @@ config LTO_CLANG
bool "Use clang Link Time Optimization (LTO) (EXPERIMENTAL)"
depends on ARCH_SUPPORTS_LTO_CLANG
depends on !FTRACE_MCOUNT_RECORD || HAVE_C_RECORDMCOUNT
depends on !KASAN
select LTO
select THIN_ARCHIVES
select LD_DEAD_CODE_DATA_ELIMINATION


@ -55,15 +55,15 @@
#elif defined(CONFIG_ALPHA_DP264) || \
defined(CONFIG_ALPHA_LYNX) || \
defined(CONFIG_ALPHA_SHARK) || \
defined(CONFIG_ALPHA_EIGER)
defined(CONFIG_ALPHA_SHARK)
# define NR_IRQS 64
#elif defined(CONFIG_ALPHA_TITAN)
#define NR_IRQS 80
#elif defined(CONFIG_ALPHA_RAWHIDE) || \
defined(CONFIG_ALPHA_TAKARA)
defined(CONFIG_ALPHA_TAKARA) || \
defined(CONFIG_ALPHA_EIGER)
# define NR_IRQS 128
#elif defined(CONFIG_ALPHA_WILDFIRE)


@ -72,9 +72,15 @@
})
#define user_termios_to_kernel_termios(k, u) \
copy_from_user(k, u, sizeof(struct termios))
copy_from_user(k, u, sizeof(struct termios2))
#define kernel_termios_to_user_termios(u, k) \
copy_to_user(u, k, sizeof(struct termios2))
#define user_termios_to_kernel_termios_1(k, u) \
copy_from_user(k, u, sizeof(struct termios))
#define kernel_termios_to_user_termios_1(u, k) \
copy_to_user(u, k, sizeof(struct termios))
#endif /* _ALPHA_TERMIOS_H */


@ -31,6 +31,11 @@
#define TCXONC _IO('t', 30)
#define TCFLSH _IO('t', 31)
#define TCGETS2 _IOR('T', 42, struct termios2)
#define TCSETS2 _IOW('T', 43, struct termios2)
#define TCSETSW2 _IOW('T', 44, struct termios2)
#define TCSETSF2 _IOW('T', 45, struct termios2)
#define TIOCSWINSZ _IOW('t', 103, struct winsize)
#define TIOCGWINSZ _IOR('t', 104, struct winsize)
#define TIOCSTART _IO('t', 110) /* start output, like ^Q */


@ -25,6 +25,19 @@ struct termios {
speed_t c_ospeed; /* output speed */
};
/* Alpha has identical termios and termios2 */
struct termios2 {
tcflag_t c_iflag; /* input mode flags */
tcflag_t c_oflag; /* output mode flags */
tcflag_t c_cflag; /* control mode flags */
tcflag_t c_lflag; /* local mode flags */
cc_t c_cc[NCCS]; /* control characters */
cc_t c_line; /* line discipline (== c_cc[19]) */
speed_t c_ispeed; /* input speed */
speed_t c_ospeed; /* output speed */
};
/* Alpha has matching termios and ktermios */
struct ktermios {
@ -147,6 +160,7 @@ struct ktermios {
#define B3000000 00034
#define B3500000 00035
#define B4000000 00036
#define BOTHER 00037
#define CSIZE 00001400
#define CS5 00000000
@ -164,6 +178,9 @@ struct ktermios {
#define CMSPAR 010000000000 /* mark or space (stick) parity */
#define CRTSCTS 020000000000 /* flow control */
#define CIBAUD 07600000
#define IBSHIFT 16
/* c_lflag bits */
#define ISIG 0x00000080
#define ICANON 0x00000100


@ -526,24 +526,19 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path,
SYSCALL_DEFINE1(osf_utsname, char __user *, name)
{
int error;
char tmp[5 * 32];
down_read(&uts_sem);
error = -EFAULT;
if (copy_to_user(name + 0, utsname()->sysname, 32))
goto out;
if (copy_to_user(name + 32, utsname()->nodename, 32))
goto out;
if (copy_to_user(name + 64, utsname()->release, 32))
goto out;
if (copy_to_user(name + 96, utsname()->version, 32))
goto out;
if (copy_to_user(name + 128, utsname()->machine, 32))
goto out;
memcpy(tmp + 0 * 32, utsname()->sysname, 32);
memcpy(tmp + 1 * 32, utsname()->nodename, 32);
memcpy(tmp + 2 * 32, utsname()->release, 32);
memcpy(tmp + 3 * 32, utsname()->version, 32);
memcpy(tmp + 4 * 32, utsname()->machine, 32);
up_read(&uts_sem);
error = 0;
out:
up_read(&uts_sem);
return error;
if (copy_to_user(name, tmp, sizeof(tmp)))
return -EFAULT;
return 0;
}
SYSCALL_DEFINE0(getpagesize)
@ -561,24 +556,22 @@ SYSCALL_DEFINE0(getdtablesize)
*/
SYSCALL_DEFINE2(osf_getdomainname, char __user *, name, int, namelen)
{
unsigned len;
int i;
int len, err = 0;
char *kname;
char tmp[32];
if (!access_ok(VERIFY_WRITE, name, namelen))
return -EFAULT;
len = namelen;
if (len > 32)
len = 32;
if (namelen < 0 || namelen > 32)
namelen = 32;
down_read(&uts_sem);
for (i = 0; i < len; ++i) {
__put_user(utsname()->domainname[i], name + i);
if (utsname()->domainname[i] == '\0')
break;
}
kname = utsname()->domainname;
len = strnlen(kname, namelen);
len = min(len + 1, namelen);
memcpy(tmp, kname, len);
up_read(&uts_sem);
if (copy_to_user(name, tmp, len))
return -EFAULT;
return 0;
}
@ -741,13 +734,14 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count)
};
unsigned long offset;
const char *res;
long len, err = -EINVAL;
long len;
char tmp[__NEW_UTS_LEN + 1];
offset = command-1;
if (offset >= ARRAY_SIZE(sysinfo_table)) {
/* Digital UNIX has a few unpublished interfaces here */
printk("sysinfo(%d)", command);
goto out;
return -EINVAL;
}
down_read(&uts_sem);
@ -755,13 +749,11 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count)
len = strlen(res)+1;
if ((unsigned long)len > (unsigned long)count)
len = count;
if (copy_to_user(buf, res, len))
err = -EFAULT;
else
err = 0;
memcpy(tmp, res, len);
up_read(&uts_sem);
out:
return err;
if (copy_to_user(buf, tmp, len))
return -EFAULT;
return 0;
}
SYSCALL_DEFINE5(osf_getsysinfo, unsigned long, op, void __user *, buffer,


@ -77,7 +77,7 @@ __load_new_mm_context(struct mm_struct *next_mm)
/* Macro for exception fixup code to access integer registers. */
#define dpf_reg(r) \
(((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \
(r) <= 18 ? (r)+8 : (r)-10])
(r) <= 18 ? (r)+10 : (r)-10])
asmlinkage void
do_page_fault(unsigned long address, unsigned long mmcsr,


@ -23,7 +23,7 @@ config ARC
select GENERIC_SMP_IDLE_THREAD
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
select HAVE_FUTEX_CMPXCHG
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select HAVE_KRETPROBES


@ -1,5 +1,4 @@
CONFIG_DEFAULT_HOSTNAME="ARCLinux"
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
@ -98,6 +97,7 @@ CONFIG_VFAT_FS=y
CONFIG_NTFS_FS=y
CONFIG_TMPFS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set


@ -1,5 +1,4 @@
CONFIG_DEFAULT_HOSTNAME="ARCLinux"
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
@ -98,6 +97,7 @@ CONFIG_VFAT_FS=y
CONFIG_NTFS_FS=y
CONFIG_TMPFS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set


@ -1,5 +1,4 @@
CONFIG_DEFAULT_HOSTNAME="ARCLinux"
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
@ -99,6 +98,7 @@ CONFIG_VFAT_FS=y
CONFIG_NTFS_FS=y
CONFIG_TMPFS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set


@ -76,6 +76,7 @@ CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_ROOT_NFS=y
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set


@ -71,5 +71,6 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set


@ -69,5 +69,6 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set


@ -80,6 +80,7 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_FTRACE=y


@ -88,6 +88,7 @@ CONFIG_NTFS_FS=y
CONFIG_TMPFS=y
CONFIG_JFFS2_FS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set


@ -87,6 +87,7 @@ CONFIG_NTFS_FS=y
CONFIG_TMPFS=y
CONFIG_JFFS2_FS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set


@ -84,7 +84,7 @@ static inline int atomic_fetch_##op(int i, atomic_t *v) \
"1: llock %[orig], [%[ctr]] \n" \
" " #asm_op " %[val], %[orig], %[i] \n" \
" scond %[val], [%[ctr]] \n" \
" \n" \
" bnz 1b \n" \
: [val] "=&r" (val), \
[orig] "=&r" (orig) \
: [ctr] "r" (&v->counter), \


@ -340,7 +340,7 @@ static inline __attribute__ ((const)) int __fls(unsigned long x)
/*
* __ffs: Similar to ffs, but zero based (0-31)
*/
static inline __attribute__ ((const)) int __ffs(unsigned long word)
static inline __attribute__ ((const)) unsigned long __ffs(unsigned long word)
{
if (!word)
return word;
@ -400,9 +400,9 @@ static inline __attribute__ ((const)) int ffs(unsigned long x)
/*
* __ffs: Similar to ffs, but zero based (0-31)
*/
static inline __attribute__ ((const)) int __ffs(unsigned long x)
static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x)
{
int n;
unsigned long n;
asm volatile(
" ffs.f %0, %1 \n" /* 0:31; 31(Z) if src 0 */


@ -23,7 +23,8 @@ void die(const char *str, struct pt_regs *regs, unsigned long address);
#define BUG() do { \
pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
dump_stack(); \
barrier_before_unreachable(); \
__builtin_trap(); \
} while (0)
#define HAVE_ARCH_BUG


@ -49,6 +49,17 @@
#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
/*
* Make sure slab-allocated buffers are 64-bit aligned when atomic64_t uses
* ARCv2 64-bit atomics (LLOCKD/SCONDD). This guarantess runtime 64-bit
* alignment for any atomic64_t embedded in buffer.
* Default ARCH_SLAB_MINALIGN is __alignof__(long long) which has a relaxed
* value of 4 (and not 8) in ARC ABI.
*/
#if defined(CONFIG_ARC_HAS_LL64) && defined(CONFIG_ARC_HAS_LLSC)
#define ARCH_SLAB_MINALIGN 8
#endif
extern void arc_cache_init(void);
extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
extern void read_decode_cache_bcr(void);


@ -92,8 +92,11 @@ __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new)
#endif /* CONFIG_ARC_HAS_LLSC */
#define cmpxchg(ptr, o, n) ((typeof(*(ptr)))__cmpxchg((ptr), \
(unsigned long)(o), (unsigned long)(n)))
#define cmpxchg(ptr, o, n) ({ \
(typeof(*(ptr)))__cmpxchg((ptr), \
(unsigned long)(o), \
(unsigned long)(n)); \
})
/*
* atomic_cmpxchg is same as cmpxchg
@ -198,8 +201,11 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
return __xchg_bad_pointer();
}
#define xchg(ptr, with) ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), \
sizeof(*(ptr))))
#define xchg(ptr, with) ({ \
(typeof(*(ptr)))__xchg((unsigned long)(with), \
(ptr), \
sizeof(*(ptr))); \
})
#endif /* CONFIG_ARC_PLAT_EZNPS */


@ -17,8 +17,11 @@
#ifndef __ASM_ARC_UDELAY_H
#define __ASM_ARC_UDELAY_H
#include <asm-generic/types.h>
#include <asm/param.h> /* HZ */
extern unsigned long loops_per_jiffy;
static inline void __delay(unsigned long loops)
{
__asm__ __volatile__(


@ -12,6 +12,7 @@
#include <linux/types.h>
#include <asm/byteorder.h>
#include <asm/page.h>
#include <asm/unaligned.h>
#ifdef CONFIG_ISA_ARCV2
#include <asm/barrier.h>
@ -94,6 +95,42 @@ static inline u32 __raw_readl(const volatile void __iomem *addr)
return w;
}
/*
* {read,write}s{b,w,l}() repeatedly access the same IO address in
* native endianness in 8-, 16-, 32-bit chunks {into,from} memory,
* @count times
*/
#define __raw_readsx(t,f) \
static inline void __raw_reads##f(const volatile void __iomem *addr, \
void *ptr, unsigned int count) \
{ \
bool is_aligned = ((unsigned long)ptr % ((t) / 8)) == 0; \
u##t *buf = ptr; \
\
if (!count) \
return; \
\
/* Some ARC CPU's don't support unaligned accesses */ \
if (is_aligned) { \
do { \
u##t x = __raw_read##f(addr); \
*buf++ = x; \
} while (--count); \
} else { \
do { \
u##t x = __raw_read##f(addr); \
put_unaligned(x, buf++); \
} while (--count); \
} \
}
#define __raw_readsb __raw_readsb
__raw_readsx(8, b)
#define __raw_readsw __raw_readsw
__raw_readsx(16, w)
#define __raw_readsl __raw_readsl
__raw_readsx(32, l)
#define __raw_writeb __raw_writeb
static inline void __raw_writeb(u8 b, volatile void __iomem *addr)
{
@ -126,6 +163,35 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
}
#define __raw_writesx(t,f) \
static inline void __raw_writes##f(volatile void __iomem *addr, \
const void *ptr, unsigned int count) \
{ \
bool is_aligned = ((unsigned long)ptr % ((t) / 8)) == 0; \
const u##t *buf = ptr; \
\
if (!count) \
return; \
\
/* Some ARC CPU's don't support unaligned accesses */ \
if (is_aligned) { \
do { \
__raw_write##f(*buf++, addr); \
} while (--count); \
} else { \
do { \
__raw_write##f(get_unaligned(buf++), addr); \
} while (--count); \
} \
}
#define __raw_writesb __raw_writesb
__raw_writesx(8, b)
#define __raw_writesw __raw_writesw
__raw_writesx(16, w)
#define __raw_writesl __raw_writesl
__raw_writesx(32, l)
/*
* MMIO can also get buffered/optimized in micro-arch, so barriers needed
* Based on ARM model for the typical use case
@ -141,10 +207,16 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
#define readb(c) ({ u8 __v = readb_relaxed(c); __iormb(); __v; })
#define readw(c) ({ u16 __v = readw_relaxed(c); __iormb(); __v; })
#define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(); __v; })
#define readsb(p,d,l) ({ __raw_readsb(p,d,l); __iormb(); })
#define readsw(p,d,l) ({ __raw_readsw(p,d,l); __iormb(); })
#define readsl(p,d,l) ({ __raw_readsl(p,d,l); __iormb(); })
#define writeb(v,c) ({ __iowmb(); writeb_relaxed(v,c); })
#define writew(v,c) ({ __iowmb(); writew_relaxed(v,c); })
#define writel(v,c) ({ __iowmb(); writel_relaxed(v,c); })
#define writesb(p,d,l) ({ __iowmb(); __raw_writesb(p,d,l); })
#define writesw(p,d,l) ({ __iowmb(); __raw_writesw(p,d,l); })
#define writesl(p,d,l) ({ __iowmb(); __raw_writesl(p,d,l); })
/*
* Relaxed API for drivers which can handle barrier ordering themselves


@ -34,9 +34,7 @@ struct machine_desc {
const char *name;
const char **dt_compat;
void (*init_early)(void);
#ifdef CONFIG_SMP
void (*init_per_cpu)(unsigned int);
#endif
void (*init_machine)(void);
void (*init_late)(void);


@ -103,7 +103,8 @@ static const char * const arc_pmu_ev_hw_map[] = {
/* counts condition */
[PERF_COUNT_HW_INSTRUCTIONS] = "iall",
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */
/* All jump instructions that are taken */
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmptak",
[PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */
#ifdef CONFIG_ISA_ARCV2
[PERF_COUNT_HW_BRANCH_MISSES] = "bpmp",


@ -209,7 +209,7 @@ __arc_copy_from_user(void *to, const void __user *from, unsigned long n)
*/
"=&r" (tmp), "+r" (to), "+r" (from)
:
: "lp_count", "lp_start", "lp_end", "memory");
: "lp_count", "memory");
return n;
}
@ -438,7 +438,7 @@ __arc_copy_to_user(void __user *to, const void *from, unsigned long n)
*/
"=&r" (tmp), "+r" (to), "+r" (from)
:
: "lp_count", "lp_start", "lp_end", "memory");
: "lp_count", "memory");
return n;
}
@ -658,7 +658,7 @@ static inline unsigned long __arc_clear_user(void __user *to, unsigned long n)
" .previous \n"
: "+r"(d_char), "+r"(res)
: "i"(0)
: "lp_count", "lp_start", "lp_end", "memory");
: "lp_count", "memory");
return res;
}
@ -691,7 +691,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
" .previous \n"
: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
: "g"(-EFAULT), "r"(count)
: "lp_count", "lp_start", "lp_end", "memory");
: "lp_count", "memory");
return res;
}


@ -17,6 +17,7 @@
#include <asm/entry.h>
#include <asm/arcregs.h>
#include <asm/cache.h>
#include <asm/irqflags.h>
.macro CPU_EARLY_SETUP
@ -47,6 +48,15 @@
sr r5, [ARC_REG_DC_CTRL]
1:
#ifdef CONFIG_ISA_ARCV2
; Unaligned access is disabled at reset, so re-enable early as
; gcc 7.3.1 (ARC GNU 2018.03) onwards generates unaligned access
; by default
lr r5, [status32]
bset r5, r5, STATUS_AD_BIT
kflag r5
#endif
.endm
.section .init.text, "ax",@progbits
@ -93,10 +103,11 @@ ENTRY(stext)
#ifdef CONFIG_ARC_UBOOT_SUPPORT
; Uboot - kernel ABI
; r0 = [0] No uboot interaction, [1] cmdline in r2, [2] DTB in r2
; r1 = magic number (board identity, unused as of now
; r1 = magic number (always zero as of now)
; r2 = pointer to uboot provided cmdline or external DTB in mem
; These are handled later in setup_arch()
; These are handled later in handle_uboot_args()
st r0, [@uboot_tag]
st r1, [@uboot_magic]
st r2, [@uboot_arg]
#endif


@ -31,10 +31,10 @@ void __init init_IRQ(void)
/* a SMP H/w block could do IPI IRQ request here */
if (plat_smp_ops.init_per_cpu)
plat_smp_ops.init_per_cpu(smp_processor_id());
#endif
if (machine_desc->init_per_cpu)
machine_desc->init_per_cpu(smp_processor_id());
#endif
}
/*


@ -488,8 +488,8 @@ static int arc_pmu_device_probe(struct platform_device *pdev)
/* loop thru all available h/w condition indexes */
for (j = 0; j < cc_bcr.c; j++) {
write_aux_reg(ARC_REG_CC_INDEX, j);
cc_name.indiv.word0 = read_aux_reg(ARC_REG_CC_NAME0);
cc_name.indiv.word1 = read_aux_reg(ARC_REG_CC_NAME1);
cc_name.indiv.word0 = le32_to_cpu(read_aux_reg(ARC_REG_CC_NAME0));
cc_name.indiv.word1 = le32_to_cpu(read_aux_reg(ARC_REG_CC_NAME1));
/* See if it has been mapped to a perf event_id */
for (i = 0; i < ARRAY_SIZE(arc_pmu_ev_hw_map); i++) {


@ -44,7 +44,8 @@ SYSCALL_DEFINE0(arc_gettls)
SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new)
{
struct pt_regs *regs = current_pt_regs();
int uval = -EFAULT;
u32 uval;
int ret;
/*
* This is only for old cores lacking LLOCK/SCOND, which by defintion
@ -57,23 +58,47 @@ SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new)
/* Z indicates to userspace if operation succeded */
regs->status32 &= ~STATUS_Z_MASK;
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
ret = access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr));
if (!ret)
goto fail;
again:
preempt_disable();
if (__get_user(uval, uaddr))
goto done;
ret = __get_user(uval, uaddr);
if (ret)
goto fault;
if (uval == expected) {
if (!__put_user(new, uaddr))
regs->status32 |= STATUS_Z_MASK;
}
if (uval != expected)
goto out;
done:
ret = __put_user(new, uaddr);
if (ret)
goto fault;
regs->status32 |= STATUS_Z_MASK;
out:
preempt_enable();
return uval;
fault:
preempt_enable();
return uval;
if (unlikely(ret != -EFAULT))
goto fail;
down_read(&current->mm->mmap_sem);
ret = fixup_user_fault(current, current->mm, (unsigned long) uaddr,
FAULT_FLAG_WRITE, NULL);
up_read(&current->mm->mmap_sem);
if (likely(!ret))
goto again;
fail:
force_sig(SIGSEGV, current);
return ret;
}
void arch_cpu_idle(void)
@ -188,6 +213,26 @@ int copy_thread(unsigned long clone_flags,
task_thread_info(current)->thr_ptr;
}
/*
* setup usermode thread pointer #1:
* when child is picked by scheduler, __switch_to() uses @c_callee to
* populate usermode callee regs: this works (despite being in a kernel
* function) since special return path for child @ret_from_fork()
* ensures those regs are not clobbered all the way to RTIE to usermode
*/
c_callee->r25 = task_thread_info(p)->thr_ptr;
#ifdef CONFIG_ARC_CURR_IN_REG
/*
* setup usermode thread pointer #2:
* however for this special use of r25 in kernel, __switch_to() sets
* r25 for kernel needs and only in the final return path is usermode
* r25 setup, from pt_regs->user_r25. So set that up as well
*/
c_regs->user_r25 = c_callee->r25;
#endif
return 0;
}


@ -32,6 +32,7 @@ unsigned int intr_to_DE_cnt;
/* Part of U-boot ABI: see head.S */
int __initdata uboot_tag;
int __initdata uboot_magic;
char __initdata *uboot_arg;
const struct machine_desc *machine_desc;
@ -381,43 +382,87 @@ void setup_processor(void)
arc_chk_core_config();
}
static inline int is_kernel(unsigned long addr)
static inline bool uboot_arg_invalid(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
return 1;
return 0;
/*
* Check that it is a untranslated address (although MMU is not enabled
* yet, it being a high address ensures this is not by fluke)
*/
if (addr < PAGE_OFFSET)
return true;
/* Check that address doesn't clobber resident kernel image */
return addr >= (unsigned long)_stext && addr <= (unsigned long)_end;
}
#define IGNORE_ARGS "Ignore U-boot args: "
/* uboot_tag values for U-boot - kernel ABI revision 0; see head.S */
#define UBOOT_TAG_NONE 0
#define UBOOT_TAG_CMDLINE 1
#define UBOOT_TAG_DTB 2
/* We always pass 0 as magic from U-boot */
#define UBOOT_MAGIC_VALUE 0
void __init handle_uboot_args(void)
{
bool use_embedded_dtb = true;
bool append_cmdline = false;
#ifdef CONFIG_ARC_UBOOT_SUPPORT
/* check that we know this tag */
if (uboot_tag != UBOOT_TAG_NONE &&
uboot_tag != UBOOT_TAG_CMDLINE &&
uboot_tag != UBOOT_TAG_DTB) {
pr_warn(IGNORE_ARGS "invalid uboot tag: '%08x'\n", uboot_tag);
goto ignore_uboot_args;
}
if (uboot_magic != UBOOT_MAGIC_VALUE) {
pr_warn(IGNORE_ARGS "non zero uboot magic\n");
goto ignore_uboot_args;
}
if (uboot_tag != UBOOT_TAG_NONE &&
uboot_arg_invalid((unsigned long)uboot_arg)) {
pr_warn(IGNORE_ARGS "invalid uboot arg: '%px'\n", uboot_arg);
goto ignore_uboot_args;
}
/* see if U-boot passed an external Device Tree blob */
if (uboot_tag == UBOOT_TAG_DTB) {
machine_desc = setup_machine_fdt((void *)uboot_arg);
/* external Device Tree blob is invalid - use embedded one */
use_embedded_dtb = !machine_desc;
}
if (uboot_tag == UBOOT_TAG_CMDLINE)
append_cmdline = true;
ignore_uboot_args:
#endif
if (use_embedded_dtb) {
machine_desc = setup_machine_fdt(__dtb_start);
if (!machine_desc)
panic("Embedded DT invalid\n");
}
/*
* NOTE: @boot_command_line is populated by setup_machine_fdt() so this
* append processing can only happen after.
*/
if (append_cmdline) {
/* Ensure a whitespace between the 2 cmdlines */
strlcat(boot_command_line, " ", COMMAND_LINE_SIZE);
strlcat(boot_command_line, uboot_arg, COMMAND_LINE_SIZE);
}
}
void __init setup_arch(char **cmdline_p)
{
#ifdef CONFIG_ARC_UBOOT_SUPPORT
/* make sure that uboot passed pointer to cmdline/dtb is valid */
if (uboot_tag && is_kernel((unsigned long)uboot_arg))
panic("Invalid uboot arg\n");
/* See if u-boot passed an external Device Tree blob */
machine_desc = setup_machine_fdt(uboot_arg); /* uboot_tag == 2 */
if (!machine_desc)
#endif
{
/* No, so try the embedded one */
machine_desc = setup_machine_fdt(__dtb_start);
if (!machine_desc)
panic("Embedded DT invalid\n");
/*
* If we are here, it is established that @uboot_arg didn't
* point to DT blob. Instead if u-boot says it is cmdline,
* append to embedded DT cmdline.
* setup_machine_fdt() would have populated @boot_command_line
*/
if (uboot_tag == 1) {
/* Ensure a whitespace between the 2 cmdlines */
strlcat(boot_command_line, " ", COMMAND_LINE_SIZE);
strlcat(boot_command_line, uboot_arg,
COMMAND_LINE_SIZE);
}
}
handle_uboot_args();
/* Save unparsed command line copy for /proc/cmdline */
*cmdline_p = boot_command_line;


@ -155,3 +155,12 @@ void do_insterror_or_kprobe(unsigned long address, struct pt_regs *regs)
insterror_is_error(address, regs);
}
/*
* abort() call generated by older gcc for __builtin_trap()
*/
void abort(void)
{
__asm__ __volatile__("trap_s 5\n");
}
EXPORT_SYMBOL(abort);


@ -185,11 +185,6 @@ static void *__init unw_hdr_alloc_early(unsigned long sz)
MAX_DMA_ADDRESS);
}
static void *unw_hdr_alloc(unsigned long sz)
{
return kmalloc(sz, GFP_KERNEL);
}
static void init_unwind_table(struct unwind_table *table, const char *name,
const void *core_start, unsigned long core_size,
const void *init_start, unsigned long init_size,
@ -370,6 +365,10 @@ ret_err:
}
#ifdef CONFIG_MODULES
static void *unw_hdr_alloc(unsigned long sz)
{
return kmalloc(sz, GFP_KERNEL);
}
static struct unwind_table *last_table;


@ -25,15 +25,11 @@
#endif
#ifdef CONFIG_ARC_HAS_LL64
# define PREFETCH_READ(RX) prefetch [RX, 56]
# define PREFETCH_WRITE(RX) prefetchw [RX, 64]
# define LOADX(DST,RX) ldd.ab DST, [RX, 8]
# define STOREX(SRC,RX) std.ab SRC, [RX, 8]
# define ZOLSHFT 5
# define ZOLAND 0x1F
#else
# define PREFETCH_READ(RX) prefetch [RX, 28]
# define PREFETCH_WRITE(RX) prefetchw [RX, 32]
# define LOADX(DST,RX) ld.ab DST, [RX, 4]
# define STOREX(SRC,RX) st.ab SRC, [RX, 4]
# define ZOLSHFT 4
@ -41,8 +37,6 @@
#endif
ENTRY_CFI(memcpy)
prefetch [r1] ; Prefetch the read location
prefetchw [r0] ; Prefetch the write location
mov.f 0, r2
;;; if size is zero
jz.d [blink]
@ -72,8 +66,6 @@ ENTRY_CFI(memcpy)
lpnz @.Lcopy32_64bytes
;; LOOP START
LOADX (r6, r1)
PREFETCH_READ (r1)
PREFETCH_WRITE (r3)
LOADX (r8, r1)
LOADX (r10, r1)
LOADX (r4, r1)
@ -117,9 +109,7 @@ ENTRY_CFI(memcpy)
lpnz @.Lcopy8bytes_1
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
ld.ab r8, [r1,4]
prefetchw [r3, 32] ;Prefetch the next write location
SHIFT_1 (r7, r6, 24)
or r7, r7, r5
@ -162,9 +152,7 @@ ENTRY_CFI(memcpy)
lpnz @.Lcopy8bytes_2
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
ld.ab r8, [r1,4]
prefetchw [r3, 32] ;Prefetch the next write location
SHIFT_1 (r7, r6, 16)
or r7, r7, r5
@ -204,9 +192,7 @@ ENTRY_CFI(memcpy)
lpnz @.Lcopy8bytes_3
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
ld.ab r8, [r1,4]
prefetchw [r3, 32] ;Prefetch the next write location
SHIFT_1 (r7, r6, 8)
or r7, r7, r5


@ -7,11 +7,39 @@
*/
#include <linux/linkage.h>
#include <asm/cache.h>
#undef PREALLOC_NOT_AVAIL
/*
* The memset implementation below is optimized to use prefetchw and prealloc
* instruction in case of CPU with 64B L1 data cache line (L1_CACHE_SHIFT == 6)
* If you want to implement optimized memset for other possible L1 data cache
* line lengths (32B and 128B) you should rewrite code carefully checking
* we don't call any prefetchw/prealloc instruction for L1 cache lines which
* don't belongs to memset area.
*/
#if L1_CACHE_SHIFT == 6
.macro PREALLOC_INSTR reg, off
prealloc [\reg, \off]
.endm
.macro PREFETCHW_INSTR reg, off
prefetchw [\reg, \off]
.endm
#else
.macro PREALLOC_INSTR
.endm
.macro PREFETCHW_INSTR
.endm
#endif
ENTRY_CFI(memset)
prefetchw [r0] ; Prefetch the write location
PREFETCHW_INSTR r0, 0 ; Prefetch the first write location
mov.f 0, r2
;;; if size is zero
jz.d [blink]
@ -48,11 +76,8 @@ ENTRY_CFI(memset)
lpnz @.Lset64bytes
;; LOOP START
#ifdef PREALLOC_NOT_AVAIL
prefetchw [r3, 64] ;Prefetch the next write location
#else
prealloc [r3, 64]
#endif
PREALLOC_INSTR r3, 64 ; alloc next line w/o fetching
#ifdef CONFIG_ARC_HAS_LL64
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
@ -85,7 +110,6 @@ ENTRY_CFI(memset)
lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes
lpnz .Lset32bytes
;; LOOP START
prefetchw [r3, 32] ;Prefetch the next write location
#ifdef CONFIG_ARC_HAS_LL64
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]


@ -840,7 +840,7 @@ void flush_cache_mm(struct mm_struct *mm)
void flush_cache_page(struct vm_area_struct *vma, unsigned long u_vaddr,
unsigned long pfn)
{
unsigned int paddr = pfn << PAGE_SHIFT;
phys_addr_t paddr = pfn << PAGE_SHIFT;
u_vaddr &= PAGE_MASK;
@ -860,8 +860,9 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page,
unsigned long u_vaddr)
{
/* TBD: do we really need to clear the kernel mapping */
__flush_dcache_page(page_address(page), u_vaddr);
__flush_dcache_page(page_address(page), page_address(page));
__flush_dcache_page((phys_addr_t)page_address(page), u_vaddr);
__flush_dcache_page((phys_addr_t)page_address(page),
(phys_addr_t)page_address(page));
}


@ -890,9 +890,11 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
struct pt_regs *regs)
{
struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
unsigned int pd0[mmu->ways];
unsigned long flags;
int set;
int set, n_ways = mmu->ways;
n_ways = min(n_ways, 4);
BUG_ON(mmu->ways > 4);
local_irq_save(flags);
@ -900,9 +902,10 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
for (set = 0; set < mmu->sets; set++) {
int is_valid, way;
unsigned int pd0[4];
/* read out all the ways of current set */
for (way = 0, is_valid = 0; way < mmu->ways; way++) {
for (way = 0, is_valid = 0; way < n_ways; way++) {
write_aux_reg(ARC_REG_TLBINDEX,
SET_WAY_TO_IDX(mmu, set, way));
write_aux_reg(ARC_REG_TLBCOMMAND, TLBRead);
@ -916,14 +919,14 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
continue;
/* Scan the set for duplicate ways: needs a nested loop */
for (way = 0; way < mmu->ways - 1; way++) {
for (way = 0; way < n_ways - 1; way++) {
int n;
if (!pd0[way])
continue;
for (n = way + 1; n < mmu->ways; n++) {
for (n = way + 1; n < n_ways; n++) {
if (pd0[way] != pd0[n])
continue;


@ -21,6 +21,7 @@
#error "Incorrect ctop.h include"
#endif
#include <linux/types.h>
#include <soc/nps/common.h>
/* core auxiliary registers */


@ -1457,6 +1457,7 @@ config NR_CPUS
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
depends on SMP
select GENERIC_IRQ_MIGRATION
help
Say Y here to experiment with turning CPUs off and on. CPUs
can be controlled through /sys/devices/system/cpu.


@ -987,14 +987,21 @@ choice
Say Y here if you want kernel low-level debugging support
on SOCFPGA(Cyclone 5 and Arria 5) based platforms.
config DEBUG_SOCFPGA_UART1
config DEBUG_SOCFPGA_ARRIA10_UART1
depends on ARCH_SOCFPGA
bool "Use SOCFPGA UART1 for low-level debug"
bool "Use SOCFPGA Arria10 UART1 for low-level debug"
select DEBUG_UART_8250
help
Say Y here if you want kernel low-level debugging support
on SOCFPGA(Arria 10) based platforms.
config DEBUG_SOCFPGA_CYCLONE5_UART1
depends on ARCH_SOCFPGA
bool "Use SOCFPGA Cyclone 5 UART1 for low-level debug"
select DEBUG_UART_8250
help
Say Y here if you want kernel low-level debugging support
on SOCFPGA(Cyclone 5 and Arria 5) based platforms.
config DEBUG_SUN9I_UART0
bool "Kernel low-level debugging messages via sun9i UART0"
@ -1340,21 +1347,21 @@ config DEBUG_OMAP2PLUS_UART
depends on ARCH_OMAP2PLUS
config DEBUG_IMX_UART_PORT
int "i.MX Debug UART Port Selection" if DEBUG_IMX1_UART || \
DEBUG_IMX25_UART || \
DEBUG_IMX21_IMX27_UART || \
DEBUG_IMX31_UART || \
DEBUG_IMX35_UART || \
DEBUG_IMX50_UART || \
DEBUG_IMX51_UART || \
DEBUG_IMX53_UART || \
DEBUG_IMX6Q_UART || \
DEBUG_IMX6SL_UART || \
DEBUG_IMX6SX_UART || \
DEBUG_IMX6UL_UART || \
DEBUG_IMX7D_UART
int "i.MX Debug UART Port Selection"
depends on DEBUG_IMX1_UART || \
DEBUG_IMX25_UART || \
DEBUG_IMX21_IMX27_UART || \
DEBUG_IMX31_UART || \
DEBUG_IMX35_UART || \
DEBUG_IMX50_UART || \
DEBUG_IMX51_UART || \
DEBUG_IMX53_UART || \
DEBUG_IMX6Q_UART || \
DEBUG_IMX6SL_UART || \
DEBUG_IMX6SX_UART || \
DEBUG_IMX6UL_UART || \
DEBUG_IMX7D_UART
default 1
depends on ARCH_MXC
help
Choose UART port on which kernel low-level debug messages
should be output.
@ -1534,7 +1541,8 @@ config DEBUG_UART_PHYS
default 0xfe800000 if ARCH_IOP32X
default 0xff690000 if DEBUG_RK32_UART2
default 0xffc02000 if DEBUG_SOCFPGA_UART0
default 0xffc02100 if DEBUG_SOCFPGA_UART1
default 0xffc02100 if DEBUG_SOCFPGA_ARRIA10_UART1
default 0xffc03000 if DEBUG_SOCFPGA_CYCLONE5_UART1
default 0xffd82340 if ARCH_IOP13XX
default 0xffe40000 if DEBUG_RCAR_GEN1_SCIF0
default 0xffe42000 if DEBUG_RCAR_GEN1_SCIF2
@ -1624,7 +1632,8 @@ config DEBUG_UART_VIRT
default 0xfeb30c00 if DEBUG_KEYSTONE_UART0
default 0xfeb31000 if DEBUG_KEYSTONE_UART1
default 0xfec02000 if DEBUG_SOCFPGA_UART0
default 0xfec02100 if DEBUG_SOCFPGA_UART1
default 0xfec02100 if DEBUG_SOCFPGA_ARRIA10_UART1
default 0xfec03000 if DEBUG_SOCFPGA_CYCLONE5_UART1
default 0xfec12000 if (DEBUG_MVEBU_UART0 || DEBUG_MVEBU_UART0_ALTERNATE) && ARCH_MVEBU
default 0xfec12100 if DEBUG_MVEBU_UART1_ALTERNATE
default 0xfec10000 if DEBUG_SIRFATLAS7_UART0
@ -1672,9 +1681,9 @@ config DEBUG_UART_8250_WORD
depends on DEBUG_LL_UART_8250 || DEBUG_UART_8250
depends on DEBUG_UART_8250_SHIFT >= 2
default y if DEBUG_PICOXCELL_UART || \
DEBUG_SOCFPGA_UART0 || DEBUG_SOCFPGA_UART1 || \
DEBUG_KEYSTONE_UART0 || DEBUG_KEYSTONE_UART1 || \
DEBUG_ALPINE_UART0 || \
DEBUG_SOCFPGA_UART0 || DEBUG_SOCFPGA_ARRIA10_UART1 || \
DEBUG_SOCFPGA_CYCLONE5_UART1 || DEBUG_KEYSTONE_UART0 || \
DEBUG_KEYSTONE_UART1 || DEBUG_ALPINE_UART0 || \
DEBUG_DAVINCI_DMx_UART0 || DEBUG_DAVINCI_DA8XX_UART1 || \
DEBUG_DAVINCI_DA8XX_UART2 || \
DEBUG_BCM_KONA_UART || DEBUG_RK32_UART2


@ -104,7 +104,7 @@ tune-$(CONFIG_CPU_V6K) =$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
tune-y := $(tune-y)
ifeq ($(CONFIG_AEABI),y)
CFLAGS_ABI :=-mabi=aapcs-linux -mno-thumb-interwork -mfpu=vfp
CFLAGS_ABI :=-mabi=aapcs-linux -mfpu=vfp
else
CFLAGS_ABI :=$(call cc-option,-mapcs-32,-mabi=apcs-gnu) $(call cc-option,-mno-thumb-interwork,)
endif


@ -112,7 +112,7 @@ CFLAGS_fdt_ro.o := $(nossp_flags)
CFLAGS_fdt_rw.o := $(nossp_flags)
CFLAGS_fdt_wip.o := $(nossp_flags)
ccflags-y := -fpic -mno-single-pic-base -fno-builtin -I$(obj)
ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin -I$(obj)
asflags-y := -DZIMAGE
# Supply kernel BSS size to the decompressor via a linker symbol.


@ -17,14 +17,13 @@
@ there.
.inst 'M' | ('Z' << 8) | (0x1310 << 16) @ tstne r0, #0x4d000
#else
mov r0, r0
AR_CLASS( mov r0, r0 )
M_CLASS( nop.w )
#endif
.endm
.macro __EFI_HEADER
#ifdef CONFIG_EFI_STUB
b __efi_start
.set start_offset, __efi_start - start
.org start + 0x3c
@


@ -130,19 +130,22 @@ start:
.rept 7
__nop
.endr
ARM( mov r0, r0 )
ARM( b 1f )
THUMB( badr r12, 1f )
THUMB( bx r12 )
#ifndef CONFIG_THUMB2_KERNEL
mov r0, r0
#else
AR_CLASS( sub pc, pc, #3 ) @ A/R: switch to Thumb2 mode
M_CLASS( nop.w ) @ M: already in Thumb2 mode
.thumb
#endif
W(b) 1f
.word _magic_sig @ Magic numbers to help the loader
.word _magic_start @ absolute load/run zImage address
.word _magic_end @ zImage end address
.word 0x04030201 @ endianness flag
THUMB( .thumb )
1: __EFI_HEADER
__EFI_HEADER
1:
ARM_BE8( setend be ) @ go BE8 if compiled for BE8
AR_CLASS( mrs r9, cpsr )
#ifdef CONFIG_ARM_VIRT_EXT
@ -1382,7 +1385,21 @@ ENTRY(efi_stub_entry)
@ Preserve return value of efi_entry() in r4
mov r4, r0
bl cache_clean_flush
@ our cache maintenance code relies on CP15 barrier instructions
@ but since we arrived here with the MMU and caches configured
@ by UEFI, we must check that the CP15BEN bit is set in SCTLR.
@ Note that this bit is RAO/WI on v6 and earlier, so the ISB in
@ the enable path will be executed on v7+ only.
mrc p15, 0, r1, c1, c0, 0 @ read SCTLR
tst r1, #(1 << 5) @ CP15BEN bit set?
bne 0f
orr r1, r1, #(1 << 5) @ CP15 barrier instructions
mcr p15, 0, r1, c1, c0, 0 @ write SCTLR
ARM( .inst 0xf57ff06f @ v7+ isb )
THUMB( isb )
0: bl cache_clean_flush
bl cache_off
@ Set parameters for booting zImage according to boot protocol


@ -1,10 +1,14 @@
#ifndef _ARM_LIBFDT_ENV_H
#define _ARM_LIBFDT_ENV_H
#include <linux/limits.h>
#include <linux/types.h>
#include <linux/string.h>
#include <asm/byteorder.h>
#define INT32_MAX S32_MAX
#define UINT32_MAX U32_MAX
typedef __be16 fdt16_t;
typedef __be32 fdt32_t;
typedef __be64 fdt64_t;


@ -701,6 +701,7 @@
pinctrl-0 = <&cpsw_default>;
pinctrl-1 = <&cpsw_sleep>;
status = "okay";
slaves = <1>;
};
&davinci_mdio {
@ -708,15 +709,14 @@
pinctrl-0 = <&davinci_mdio_default>;
pinctrl-1 = <&davinci_mdio_sleep>;
status = "okay";
ethphy0: ethernet-phy@0 {
reg = <0>;
};
};
&cpsw_emac0 {
phy_id = <&davinci_mdio>, <0>;
phy-mode = "rgmii-txid";
};
&cpsw_emac1 {
phy_id = <&davinci_mdio>, <1>;
phy-handle = <&ethphy0>;
phy-mode = "rgmii-txid";
};


@ -74,6 +74,11 @@
};
};
/* Table Table 5-79 of the TRM shows 480ab000 is reserved */
&usb_otg_hs {
status = "disabled";
};
&iva {
status = "disabled";
};


@ -1117,6 +1117,8 @@
ti,hwmods = "dss_dispc";
clocks = <&disp_clk>;
clock-names = "fck";
max-memory-bandwidth = <230000000>;
};
rfbi: rfbi@4832a800 {


@ -79,7 +79,7 @@
};
lcd0: display {
compatible = "osddisplays,osd057T0559-34ts", "panel-dpi";
compatible = "osddisplays,osd070t1718-19ts", "panel-dpi";
label = "lcd";
panel-timing {


@ -533,6 +533,8 @@
touchscreen-size-x = <480>;
touchscreen-size-y = <272>;
wakeup-source;
};
tlv320aic3106: tlv320aic3106@1b {


@ -41,7 +41,7 @@
};
lcd0: display {
compatible = "osddisplays,osd057T0559-34ts", "panel-dpi";
compatible = "osddisplays,osd070t1718-19ts", "panel-dpi";
label = "lcd";
panel-timing {


@ -334,7 +334,7 @@
clock-names = "uartclk", "apb_pclk";
};
ssp: ssp@1000d000 {
ssp: spi@1000d000 {
compatible = "arm,pl022", "arm,primecell";
reg = <0x1000d000 0x1000>;
clocks = <&sspclk>, <&pclk>;

Some files were not shown because too many files have changed in this diff.