diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/Makefile linux-2.4-xfs/fs/xfs/Makefile --- linux-2.4.19/fs/xfs/Makefile Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/Makefile Tue Aug 6 16:50:35 2002 @@ -0,0 +1,155 @@ +# +# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# +# Makefile for XFS on Linux. +# + +# This needs -I. because everything does #include <xfs.h> instead of "xfs.h". +# The code is wrong, local files should be included using "xfs.h", not <xfs.h> +# but I am not going to change every file at the moment. +EXTRA_CFLAGS += -I. 
-funsigned-char + +ifeq ($(CONFIG_XFS_DEBUG),y) + EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG +endif +ifeq ($(CONFIG_PAGEBUF_DEBUG),y) + EXTRA_CFLAGS += -DPAGEBUF_TRACE +endif + +subdir-$(CONFIG_XFS_FS) += pagebuf linux support + +ifeq ($(CONFIG_XFS_DMAPI),y) + subdir-$(CONFIG_XFS_FS) += dmapi +endif + +# fs/Makefile enters fs/xfs twice if CONFIG_XFS_FS is y, once for kernel and +# once for modules. This is necessary because xfsidbg can be built as a module +# even if xfs is in kernel. Alas the shorthand form +# O_TARGET := xfs.o +# obj-m := $(O_TARGET) +# fails when the makefile is run more than once, code gets compiled as both +# kernel and as module, which one gets linked depends on the phase of the moon. +# I just love these layer violations where a makefile behaves differently +# depending on changes to its parent. Work around by only setting obj-m when +# xfs is selected as a module. Keith Owens. + +O_TARGET := xfs.o +ifeq ($(CONFIG_XFS_FS),m) + obj-m := $(O_TARGET) +endif + +obj-$(CONFIG_XFS_DMAPI) += xfs_dmapi.o dmapi/dmapi_core.o + +obj-$(CONFIG_XFS_RT) += xfs_rtalloc.o + +obj-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm.o + +obj-$(CONFIG_FS_POSIX_ACL) += xfs_acl.o +obj-$(CONFIG_FS_POSIX_CAP) += xfs_cap.o +obj-$(CONFIG_FS_POSIX_MAC) += xfs_mac.o + +obj-y += xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_fetch.o \ + xfs_attr_leaf.o \ + xfs_bit.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_buf_item.o \ + xfs_da_btree.o \ + xfs_dir.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_dir2_trace.o \ + xfs_dir_leaf.o \ + xfs_error.o \ + xfs_extfree_item.o \ + xfs_fsops.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_iget.o \ + xfs_inode.o \ + xfs_inode_item.o \ + xfs_iocore.o \ + xfs_itable.o \ + xfs_dfrag.o \ + xfs_log.o \ + xfs_log_recover.o \ + xfs_macros.o \ + xfs_mount.o \ + 
xfs_rename.o \ + xfs_trans.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + xfs_trans_item.o \ + xfs_utils.o \ + xfs_vfsops.o \ + xfs_vnodeops.o \ + xfs_rw.o + +# Objects not built in this directory +obj-y += pagebuf/pagebuf.o \ + linux/linux_xfs.o \ + support/support_xfs.o + +# If both xfs and kdb modules are built in then xfsidbg is built in. If xfs is +# a module and kdb modules are being compiled then xfsidbg must be a module, to +# follow xfs. If xfs is built in then xfsidbg tracks the kdb module state. +# This must come after the main xfs code so xfs initialises before xfsidbg. +# KAO +ifneq ($(CONFIG_KDB_MODULES),) + ifeq ($(CONFIG_XFS_FS),y) + obj-$(CONFIG_KDB_MODULES) += xfsidbg.o + else + obj-$(CONFIG_XFS_FS) += xfsidbg.o + endif +endif + +CFLAGS_xfsidbg.o += -I $(TOPDIR)/arch/$(ARCH)/kdb + +include $(TOPDIR)/Rules.make + +# This is really nasty, but Rules.make was never designed for multi directory +# modules. Keith Owens. + +xfs.o: $(patsubst %,_modsubdir_%,$(subdir-m)) diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/Makefile.in linux-2.4-xfs/fs/xfs/Makefile.in --- linux-2.4.19/fs/xfs/Makefile.in Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/Makefile.in Wed Jul 31 11:49:51 2002 @@ -0,0 +1,87 @@ +# +# Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. 
Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# +# Makefile for XFS on Linux. +# + +objlink(CONFIG_XFS_DMAPI xfs.o xfs_dmapi.o dmapi/dmapi_core.o) + +objlink(CONFIG_XFS_RT xfs.o xfs_rtalloc.o) + +objlink(CONFIG_XFS_QUOTA xfs.o xfs_dquot.o xfs_dquot_item.o xfs_trans_dquot.o xfs_qm_syscalls.o xfs_qm.o) + +objlink(CONFIG_FS_POSIX_ACL xfs.o xfs_acl.o) +objlink(CONFIG_FS_POSIX_CAP xfs.o xfs_cap.o) +objlink(CONFIG_FS_POSIX_MAC xfs.o xfs_mac.o) + +objlink(xfs.o xfs_alloc.o xfs_alloc_btree.o xfs_attr.o xfs_attr_fetch.o + xfs_attr_leaf.o xfs_bit.o xfs_bmap.o xfs_bmap_btree.o xfs_btree.o + xfs_buf_item.o xfs_da_btree.o xfs_dir.o xfs_dir2.o xfs_dir2_block.o + xfs_dir2_data.o xfs_dir2_leaf.o xfs_dir2_node.o xfs_dir2_sf.o + xfs_dir2_trace.o xfs_dir_leaf.o xfs_error.o xfs_extfree_item.o + xfs_fsops.o xfs_ialloc.o xfs_ialloc_btree.o xfs_iget.o xfs_inode.o + xfs_inode_item.o xfs_iocore.o xfs_itable.o xfs_dfrag.o xfs_log.o + xfs_log_recover.o xfs_macros.o xfs_mount.o xfs_rename.o xfs_trans.o + xfs_trans_ail.o xfs_trans_buf.o xfs_trans_extfree.o xfs_trans_inode.o + xfs_trans_item.o xfs_utils.o xfs_vfsops.o xfs_vnodeops.o xfs_rw.o) + +# Objects not built in this directory +objlink(xfs.o pagebuf/pagebuf.o linux/linux_xfs.o support/support_xfs.o) + +select(CONFIG_XFS_FS xfs.o) + +# If both xfs and kdb modules are built in then xfsidbg is built in. 
If xfs is +# a module and kdb modules are being compiled then xfsidbg must be a module, to +# follow xfs. If xfs is built in then xfsidbg tracks the kdb module state. +# This must come after the main xfs code so xfs initialises before xfsidbg. +# KAO +ifsel(CONFIG_KDB_MODULES) + ifselnmod(CONFIG_XFS_FS) + select(CONFIG_KDB_MODULES xfsidbg.o) + else + select(CONFIG_XFS_FS xfsidbg.o) + endif +endif + +XFS_EXTRA_CFLAGS := $(src_includelist /fs/xfs) -funsigned-char +ifsel(CONFIG_XFS_DEBUG) + XFS_EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG +endif +ifsel(CONFIG_PAGEBUF_DEBUG) + XFS_EXTRA_CFLAGS += -DPAGEBUF_TRACE +endif +extra_cflags_all($(XFS_EXTRA_CFLAGS)) + +extra_cflags(xfsidbg.o $(src_includelist /arch/$(ARCH)/kdb)) + +# FIXME: xfsidbg includes "pagebuf/page_buf_internal.h" which includes +# "page_buf.h" and "page_buf_trace.h" +extra_cflags(xfsidbg.o $(src_includelist pagebuf)) diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/Makefile linux-2.4-xfs/fs/xfs/dmapi/Makefile --- linux-2.4.19/fs/xfs/dmapi/Makefile Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/Makefile Sun Aug 4 13:36:07 2002 @@ -0,0 +1,59 @@ +# +# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. 
Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# + +EXTRA_CFLAGS += -I $(TOPDIR)/fs/xfs + +ifeq ($(CONFIG_XFS_DEBUG),y) + EXTRA_CFLAGS += -g -DDEBUG -DXFSDEBUG +endif + +O_TARGET := dmapi_core.o +ifneq ($(MAKECMDGOALS),modules_install) + obj-m := $(O_TARGET) +endif + +obj-y += dmapi_sysent.o \ + dmapi_attr.o \ + dmapi_config.o \ + dmapi_bulkattr.o \ + dmapi_dmattr.o \ + dmapi_event.o \ + dmapi_handle.o \ + dmapi_hole.o \ + dmapi_io.o \ + dmapi_mountinfo.o \ + dmapi_region.o \ + dmapi_register.o \ + dmapi_right.o \ + dmapi_session.o + +include $(TOPDIR)/Rules.make diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/Makefile.in linux-2.4-xfs/fs/xfs/dmapi/Makefile.in --- linux-2.4.19/fs/xfs/dmapi/Makefile.in Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/Makefile.in Thu Aug 1 12:24:25 2002 @@ -0,0 +1,41 @@ +# +# +# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + +objlink(dmapi_core.o dmapi_sysent.o dmapi_attr.o dmapi_config.o dmapi_bulkattr.o + dmapi_dmattr.o dmapi_event.o dmapi_handle.o dmapi_hole.o dmapi_io.o + dmapi_mountinfo.o dmapi_region.o dmapi_register.o dmapi_right.o + dmapi_session.o) + +# No select() for dmapi in this directory. It is a sub-component of XFS, +# see fs/xfs/Makefile.in for the objlink. 
+ +extra_cflags_all($(src_includelist /fs/xfs)) diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/Status linux-2.4-xfs/fs/xfs/dmapi/Status --- linux-2.4.19/fs/xfs/dmapi/Status Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/Status Tue Jul 30 18:08:15 2002 @@ -0,0 +1,126 @@ +for linux: + + +68 external interfaces in libdm + + 56 of those interfaces go through to dmi(), the kernel side of DMAPI + + + +Functions known to work +---------------------------------------------- + +dm_create_session +dm_create_userevent +dm_destroy_session +dm_getall_sessions +dm_getall_tokens +dm_get_allocinfo +dm_get_bulkattr +dm_get_config_events +dm_get_dmattr +dm_get_eventlist +dm_get_events +dm_get_fileattr +dm_get_region +dm_handle_free +dm_init_attrloc +dm_init_service +dm_obj_ref_hold +dm_obj_ref_query +dm_obj_ref_rele +dm_path_to_fshandle +dm_path_to_handle +dm_punch_hole +dm_query_session +dm_read_invis +dm_remove_dmattr +dm_respond_event +dm_send_msg +dm_set_disp +dm_set_dmattr +dm_set_eventlist +dm_set_fileattr +dm_set_region +dm_sync_by_handle +dm_write_invis +34 + +Functions that seem to work (would like more rigorous test case) +------------------------------------------ + +dm_pending +dm_probe_hole - one test case of test_hole.c fails +dm_request_right +3 + +Functions untested but probably work +---------------------------------------------- + +dm_find_eventmsg +dm_handle_cmp +dm_handle_to_fshandle +dm_handle_to_ino +dm_release_right +5 + +Functions that do not work +----------------------------------------- + +dm_get_dioinfo - directio not implemented +1 + +Functions not supported in SGI DMAPI +------------------------------------------------------------- + +dm_clear_inherit +dm_create_by_handle +dm_getall_inherit +dm_get_bulkall +dm_mkdir_by_handle +dm_set_inherit +dm_symlink_by_handle + + + + +Functions that seem to work (would like more rigorous test case) +---------------------------------------------------------------- + +dm_get_config +dm_downgrade_right 
+dm_get_mountinfo +dm_set_return_on_destroy +dm_upgrade_right + + + +Functions that do not work +----------------------------------------------------------------- + +dm_fd_to_handle - Irix getf not implemented on linux +dm_get_dirattrs - null pointer reference +dm_handle_to_path +dm_getall_dmattr - needs a copy_from_user in place of useracc + + +Functions that are untested, but probably work +----------------------------------------------------------------- + +dm_getall_disp +dm_handle_hash +dm_handle_is_valid +dm_handle_to_fsid +dm_handle_to_igen +dm_make_fshandle +dm_make_handle +dm_move_event +dm_query_right + + + +Other things not working +---------------------------------- + +- read/write events for memory-mapped I/O? + diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi.h linux-2.4-xfs/fs/xfs/dmapi/dmapi.h --- linux-2.4.19/fs/xfs/dmapi/dmapi.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/dmapi.h Tue Jul 30 18:09:44 2002 @@ -0,0 +1,1043 @@ +/* + * Copyright (c) 1995-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, + * USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef _SYS_DMAPI_H +#define _SYS_DMAPI_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef __KERNEL__ +#include <sys/types.h> +#endif +#include <linux/types.h> + +/************************************************************************** + * * + * The SGI implementation of DMAPI is based upon the X/Open document * + * Systems Management: Data Storage Management (XDSM) API * + * dated February 1997. Not all DMAPI functions and structure fields * + * have been implemented. Most importantly, the DMAPI functions * + * dm_request_right, dm_release_right, dm_query_right, dm_upgrade_right * + * and dm_downgrade_right do not work as described in the specification. * + * * + * The XFS filesystem currently does not allow its locking mechanisms to * + * be externally accessed from user space. While the above-mentioned * + * dm_xxx_right functions exist and can be called by applications, they * + * always return successfully without actually obtaining any locks * + * within the filesystem. * + * * + * Applications which do not need full rights support and which only * + * make dm_xxx_right calls in order to satisfy the input requirements of * + * other DMAPI calls should be able to use these routines to avoid * + * having to implement special-case code for SGI platforms. Applications * + * which truly need the capabilities of a full implementation of rights * + * will unfortunately have to come up with alternate software solutions * + * until such time as rights can be completely implemented. 
* + * * + * Functions and structure fields defined within this file which are not * + * supported in the SGI implementation of DMAPI are indicated by comments * + * following their definitions such as "not supported", or "not * + * completely supported". Any function or field not so marked may be * + * assumed to work exactly according to the spec. * + * * + **************************************************************************/ + + + +/* The first portion of this file contains defines and typedefs that are + DMAPI implementation-dependent, and could be different on other platforms. +*/ + +typedef __s64 dm_attrloc_t; +typedef unsigned int dm_boolean_t; +typedef __u64 dm_eventset_t; +typedef __u64 dm_fsid_t; +typedef __u64 dm_ino_t; +typedef __u32 dm_igen_t; +typedef __s64 dm_off_t; +typedef unsigned int dm_sequence_t; +typedef int dm_sessid_t; +typedef __u64 dm_size_t; +typedef __s64 dm_ssize_t; +typedef int dm_token_t; + +/* XXX dev_t, mode_t, and nlink_t are not the same size in kernel space + and user space. This affects the field offsets for dm_stat_t. + The following solution is temporary. + + user space sizes: dev_t=8 mode_t=4 nlink_t=4 + kernel space : dev_t=2 mode_t=2 nlink_t=2 + +*/ +typedef __s64 dm_dev_t; +typedef int dm_mode_t; +typedef int dm_nlink_t; + + +#define DM_REGION_NOEVENT 0x0 +#define DM_REGION_READ 0x1 +#define DM_REGION_WRITE 0x2 +#define DM_REGION_TRUNCATE 0x4 + +/* Values for the mask argument used with dm_get_fileattr, dm_get_bulkattr, + dm_get_dirattrs, and dm_set_fileattr. 
+*/ + +#define DM_AT_MODE 0x0001 +#define DM_AT_UID 0x0002 +#define DM_AT_GID 0x0004 +#define DM_AT_ATIME 0x0008 +#define DM_AT_MTIME 0x0010 +#define DM_AT_CTIME 0x0020 +#define DM_AT_SIZE 0x0040 +#define DM_AT_DTIME 0x0080 +#define DM_AT_HANDLE 0x0100 +#define DM_AT_EMASK 0x0200 +#define DM_AT_PMANR 0x0400 +#define DM_AT_PATTR 0x0800 +#define DM_AT_STAT 0x1000 +#define DM_AT_CFLAG 0x2000 + +#define DM_EV_WAIT 0x1 /* used in dm_get_events() */ + +#define DM_MOUNT_RDONLY 0x1 /* me_mode field in dm_mount_event_t */ + +#define DM_RR_WAIT 0x1 + +#define DM_UNMOUNT_FORCE 0x1 /* ne_mode field in dm_namesp_event_t */ + +#define DM_WRITE_SYNC 0x1 /* used in dm_write_invis() */ + +#define DM_SESSION_INFO_LEN 256 +#define DM_NO_SESSION 0 +#define DM_TRUE 1 +#define DM_FALSE 0 +#define DM_INVALID_TOKEN 0 +#define DM_NO_TOKEN (-1) +#define DM_INVALID_HANP NULL +#define DM_INVALID_HLEN 0 +#define DM_GLOBAL_HANP ((void *)(1LL)) +#define DM_GLOBAL_HLEN ((size_t)(1)) +#define DM_VER_STR_CONTENTS "SGI DMAPI (XDSM) API, Release 1.0." + + +#define DMEV_SET(event_type, event_list) \ + ((event_list) |= (1 << (event_type))) +#define DMEV_CLR(event_type, event_list) \ + ((event_list) &= ~(1 << (event_type))) +#define DMEV_ISSET(event_type, event_list) \ + (int)(((event_list) & (1 << (event_type))) != 0) +#define DMEV_ZERO(event_list) \ + (event_list) = 0 + + +typedef struct { + int vd_offset; /* offset from start of containing struct */ + unsigned int vd_length; /* length of data starting at vd_offset */ +} dm_vardata_t; + +#define DM_GET_VALUE(p, field, type) \ + ((type) ((char *)(p) + (p)->field.vd_offset)) + +#define DM_GET_LEN(p, field) \ + ((p)->field.vd_length) + +#define DM_STEP_TO_NEXT(p, type) \ + ((type) ((p)->_link ? (char *)(p) + (p)->_link : NULL)) + + + + +/* The remainder of this include file contains defines, typedefs, and + structures which are strictly defined by the DMAPI 2.3 specification. 
+ + (The _link field which appears in several structures is an + implementation-specific way to implement DM_STEP_TO_NEXT, and + should not be referenced directly by application code.) +*/ + + +#define DM_ATTR_NAME_SIZE 8 + + +struct dm_attrname { + unsigned char an_chars[DM_ATTR_NAME_SIZE]; +}; +typedef struct dm_attrname dm_attrname_t; + + +struct dm_attrlist { + int _link; + dm_attrname_t al_name; + dm_vardata_t al_data; +}; +typedef struct dm_attrlist dm_attrlist_t; + + +typedef enum { + DM_CONFIG_INVALID, + DM_CONFIG_BULKALL, + DM_CONFIG_CREATE_BY_HANDLE, + DM_CONFIG_DTIME_OVERLOAD, + DM_CONFIG_LEGACY, + DM_CONFIG_LOCK_UPGRADE, + DM_CONFIG_MAX_ATTR_ON_DESTROY, + DM_CONFIG_MAX_ATTRIBUTE_SIZE, + DM_CONFIG_MAX_HANDLE_SIZE, + DM_CONFIG_MAX_MANAGED_REGIONS, + DM_CONFIG_MAX_MESSAGE_DATA, + DM_CONFIG_OBJ_REF, + DM_CONFIG_PENDING, + DM_CONFIG_PERS_ATTRIBUTES, + DM_CONFIG_PERS_EVENTS, + DM_CONFIG_PERS_INHERIT_ATTRIBS, + DM_CONFIG_PERS_MANAGED_REGIONS, + DM_CONFIG_PUNCH_HOLE, + DM_CONFIG_TOTAL_ATTRIBUTE_SPACE, + DM_CONFIG_WILL_RETRY +} dm_config_t; + + +struct dm_dioinfo { /* non-standard SGI addition */ + unsigned int d_mem; + unsigned int d_miniosz; + unsigned int d_maxiosz; + dm_boolean_t d_dio_only; +}; +typedef struct dm_dioinfo dm_dioinfo_t; + + +struct dm_dispinfo { + int _link; + unsigned int di_pad1; /* reserved; do not reference */ + dm_vardata_t di_fshandle; + dm_eventset_t di_eventset; +}; +typedef struct dm_dispinfo dm_dispinfo_t; + + +typedef enum { + DM_EVENT_INVALID = -1, + DM_EVENT_CANCEL = 0, /* not supported */ + DM_EVENT_MOUNT = 1, + DM_EVENT_PREUNMOUNT = 2, + DM_EVENT_UNMOUNT = 3, + DM_EVENT_DEBUT = 4, /* not supported */ + DM_EVENT_CREATE = 5, + DM_EVENT_CLOSE = 6, /* not supported */ + DM_EVENT_POSTCREATE = 7, + DM_EVENT_REMOVE = 8, + DM_EVENT_POSTREMOVE = 9, + DM_EVENT_RENAME = 10, + DM_EVENT_POSTRENAME = 11, + DM_EVENT_LINK = 12, + DM_EVENT_POSTLINK = 13, + DM_EVENT_SYMLINK = 14, + DM_EVENT_POSTSYMLINK = 15, + DM_EVENT_READ = 16, + 
DM_EVENT_WRITE = 17, + DM_EVENT_TRUNCATE = 18, + DM_EVENT_ATTRIBUTE = 19, + DM_EVENT_DESTROY = 20, + DM_EVENT_NOSPACE = 21, + DM_EVENT_USER = 22, + DM_EVENT_MAX = 23 +} dm_eventtype_t; + + +struct dm_eventmsg { + int _link; + dm_eventtype_t ev_type; + dm_token_t ev_token; + dm_sequence_t ev_sequence; + dm_vardata_t ev_data; +}; +typedef struct dm_eventmsg dm_eventmsg_t; + + +struct dm_cancel_event { /* not supported */ + dm_sequence_t ce_sequence; + dm_token_t ce_token; +}; +typedef struct dm_cancel_event dm_cancel_event_t; + + +struct dm_data_event { + dm_vardata_t de_handle; + dm_off_t de_offset; + dm_size_t de_length; +}; +typedef struct dm_data_event dm_data_event_t; + +struct dm_destroy_event { + dm_vardata_t ds_handle; + dm_attrname_t ds_attrname; + dm_vardata_t ds_attrcopy; +}; +typedef struct dm_destroy_event dm_destroy_event_t; + +struct dm_mount_event { + dm_mode_t me_mode; + dm_vardata_t me_handle1; + dm_vardata_t me_handle2; + dm_vardata_t me_name1; + dm_vardata_t me_name2; + dm_vardata_t me_roothandle; +}; +typedef struct dm_mount_event dm_mount_event_t; + +struct dm_namesp_event { + dm_mode_t ne_mode; + dm_vardata_t ne_handle1; + dm_vardata_t ne_handle2; + dm_vardata_t ne_name1; + dm_vardata_t ne_name2; + int ne_retcode; +}; +typedef struct dm_namesp_event dm_namesp_event_t; + + +typedef enum { + DM_EXTENT_INVALID, + DM_EXTENT_RES, + DM_EXTENT_HOLE +} dm_extenttype_t; + + +struct dm_extent { + dm_extenttype_t ex_type; + unsigned int ex_pad1; /* reserved; do not reference */ + dm_off_t ex_offset; + dm_size_t ex_length; +}; +typedef struct dm_extent dm_extent_t; + +struct dm_fileattr { + dm_mode_t fa_mode; + uid_t fa_uid; + gid_t fa_gid; + time_t fa_atime; + time_t fa_mtime; + time_t fa_ctime; + time_t fa_dtime; + unsigned int fa_pad1; /* reserved; do not reference */ + dm_off_t fa_size; +}; +typedef struct dm_fileattr dm_fileattr_t; + + +struct dm_inherit { /* not supported */ + dm_attrname_t ih_name; + dm_mode_t ih_filetype; +}; +typedef struct 
dm_inherit dm_inherit_t; + + +typedef enum { + DM_MSGTYPE_INVALID, + DM_MSGTYPE_SYNC, + DM_MSGTYPE_ASYNC +} dm_msgtype_t; + + +struct dm_region { + dm_off_t rg_offset; + dm_size_t rg_size; + unsigned int rg_flags; + unsigned int rg_pad1; /* reserved; do not reference */ +}; +typedef struct dm_region dm_region_t; + + +typedef enum { + DM_RESP_INVALID, + DM_RESP_CONTINUE, + DM_RESP_ABORT, + DM_RESP_DONTCARE +} dm_response_t; + + +typedef enum { + DM_RIGHT_NULL, + DM_RIGHT_SHARED, + DM_RIGHT_EXCL +} dm_right_t; + + +struct dm_stat { + int _link; + dm_vardata_t dt_handle; + dm_vardata_t dt_compname; + int dt_nevents; + dm_eventset_t dt_emask; + int dt_pers; /* field not supported */ + int dt_pmanreg; + time_t dt_dtime; + unsigned int dt_change; /* field not supported */ + unsigned int dt_pad1; /* reserved; do not reference */ + dm_dev_t dt_dev; + dm_ino_t dt_ino; + dm_mode_t dt_mode; + dm_nlink_t dt_nlink; + uid_t dt_uid; + gid_t dt_gid; + dm_dev_t dt_rdev; + unsigned int dt_pad2; /* reserved; do not reference */ + dm_off_t dt_size; + time_t dt_atime; + time_t dt_mtime; + time_t dt_ctime; + unsigned int dt_blksize; + dm_size_t dt_blocks; + + /* Non-standard filesystem-specific fields. Currently XFS is the only + supported filesystem type. + */ + + __u64 dt_pad3; /* reserved; do not reference */ + int dt_fstype; /* filesystem index; see sysfs(2) */ + union { + struct { + dm_igen_t igen; + unsigned int xflags; + unsigned int extsize; + unsigned int extents; + unsigned short aextents; + unsigned short dmstate; + } sgi_xfs; + } fsys_dep; +}; +typedef struct dm_stat dm_stat_t; + +#define dt_xfs_igen fsys_dep.sgi_xfs.igen +#define dt_xfs_xflags fsys_dep.sgi_xfs.xflags +#define dt_xfs_extsize fsys_dep.sgi_xfs.extsize +#define dt_xfs_extents fsys_dep.sgi_xfs.extents +#define dt_xfs_aextents fsys_dep.sgi_xfs.aextents +#define dt_xfs_dmstate fsys_dep.sgi_xfs.dmstate + +/* Flags for the non-standard dt_xfs_xflags field. 
*/ + +#define DM_XFLAG_REALTIME 0x1 +#define DM_XFLAG_PREALLOC 0x2 +#define DM_XFLAG_HASATTR 0x80000000 + + +struct dm_timestruct { + time_t dm_tv_sec; + int dm_tv_nsec; +}; +typedef struct dm_timestruct dm_timestruct_t; + + +struct dm_xstat { /* not supported */ + dm_stat_t dx_statinfo; + dm_vardata_t dx_attrdata; +}; +typedef struct dm_xstat dm_xstat_t; + + + +/* The following list provides the prototypes for all functions defined in + the DMAPI interface. +*/ + +extern int +dm_clear_inherit( /* not supported */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_attrname_t *attrnamep); + +extern int +dm_create_by_handle( /* not supported */ + dm_sessid_t sid, + void *dirhanp, + size_t dirhlen, + dm_token_t token, + void *hanp, + size_t hlen, + char *cname); + +extern int +dm_create_session( + dm_sessid_t oldsid, + char *sessinfop, + dm_sessid_t *newsidp); + +extern int +dm_create_userevent( + dm_sessid_t sid, + size_t msglen, + void *msgdatap, + dm_token_t *tokenp); + +extern int +dm_destroy_session( + dm_sessid_t sid); + +extern int +dm_downgrade_right( /* not completely supported; see caveat above */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token); + +extern int +dm_fd_to_handle( + int fd, + void **hanpp, + size_t *hlenp); + +extern int +dm_find_eventmsg( + dm_sessid_t sid, + dm_token_t token, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_get_allocinfo( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_off_t *offp, + unsigned int nelem, + dm_extent_t *extentp, + unsigned int *nelemp); + +extern int +dm_get_bulkall( /* not supported */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int mask, + dm_attrname_t *attrnamep, + dm_attrloc_t *locp, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_get_bulkattr( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int mask, + dm_attrloc_t *locp, + size_t buflen, 
+ void *bufp, + size_t *rlenp); + +extern int +dm_get_config( + void *hanp, + size_t hlen, + dm_config_t flagname, + dm_size_t *retvalp); + +extern int +dm_get_config_events( + void *hanp, + size_t hlen, + unsigned int nelem, + dm_eventset_t *eventsetp, + unsigned int *nelemp); + +extern int +dm_get_dirattrs( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int mask, + dm_attrloc_t *locp, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_get_dmattr( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_attrname_t *attrnamep, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_get_eventlist( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int nelem, + dm_eventset_t *eventsetp, + unsigned int *nelemp); + +extern int +dm_get_events( + dm_sessid_t sid, + unsigned int maxmsgs, + unsigned int flags, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_get_fileattr( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int mask, + dm_stat_t *statp); + +extern int +dm_get_mountinfo( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_get_region( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int nelem, + dm_region_t *regbufp, + unsigned int *nelemp); + +extern int +dm_getall_disp( + dm_sessid_t sid, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_getall_dmattr( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern int +dm_getall_inherit( /* not supported */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int nelem, + dm_inherit_t *inheritbufp, + unsigned int *nelemp); + +extern int +dm_getall_sessions( + unsigned int nelem, + dm_sessid_t *sidbufp, + unsigned int *nelemp); + +extern int 
+dm_getall_tokens( + dm_sessid_t sid, + unsigned int nelem, + dm_token_t *tokenbufp, + unsigned int *nelemp); + +extern int +dm_handle_cmp( + void *hanp1, + size_t hlen1, + void *hanp2, + size_t hlen2); + +extern void +dm_handle_free( + void *hanp, + size_t hlen); + +extern u_int +dm_handle_hash( + void *hanp, + size_t hlen); + +extern dm_boolean_t +dm_handle_is_valid( + void *hanp, + size_t hlen); + +extern int +dm_handle_to_fshandle( + void *hanp, + size_t hlen, + void **fshanpp, + size_t *fshlenp); + +extern int +dm_handle_to_fsid( + void *hanp, + size_t hlen, + dm_fsid_t *fsidp); + +extern int +dm_handle_to_igen( + void *hanp, + size_t hlen, + dm_igen_t *igenp); + +extern int +dm_handle_to_ino( + void *hanp, + size_t hlen, + dm_ino_t *inop); + +extern int +dm_handle_to_path( + void *dirhanp, + size_t dirhlen, + void *targhanp, + size_t targhlen, + size_t buflen, + char *pathbufp, + size_t *rlenp); + +extern int +dm_init_attrloc( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_attrloc_t *locp); + +extern int +dm_init_service( + char **versionstrpp); + +extern int +dm_make_handle( + dm_fsid_t *fsidp, + dm_ino_t *inop, + dm_igen_t *igenp, + void **hanpp, + size_t *hlenp); + +extern int +dm_make_fshandle( + dm_fsid_t *fsidp, + void **hanpp, + size_t *hlenp); + +extern int +dm_mkdir_by_handle( /* not supported */ + dm_sessid_t sid, + void *dirhanp, + size_t dirhlen, + dm_token_t token, + void *hanp, + size_t hlen, + char *cname); + +extern int +dm_move_event( + dm_sessid_t srcsid, + dm_token_t token, + dm_sessid_t targetsid, + dm_token_t *rtokenp); + +extern int +dm_obj_ref_hold( + dm_sessid_t sid, + dm_token_t token, + void *hanp, + size_t hlen); + +extern int +dm_obj_ref_query( + dm_sessid_t sid, + dm_token_t token, + void *hanp, + size_t hlen); + +extern int +dm_obj_ref_rele( + dm_sessid_t sid, + dm_token_t token, + void *hanp, + size_t hlen); + +extern int +dm_path_to_fshandle( + char *path, + void **hanpp, + size_t *hlenp); + +extern 
int +dm_path_to_handle( + char *path, + void **hanpp, + size_t *hlenp); + +extern int +dm_pending( + dm_sessid_t sid, + dm_token_t token, + dm_timestruct_t *delay); + +extern int +dm_probe_hole( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_off_t off, + dm_size_t len, + dm_off_t *roffp, + dm_size_t *rlenp); + +extern int +dm_punch_hole( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_off_t off, + dm_size_t len); + +extern int +dm_query_right( /* not completely supported; see caveat above */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_right_t *rightp); + +extern int +dm_query_session( + dm_sessid_t sid, + size_t buflen, + void *bufp, + size_t *rlenp); + +extern dm_ssize_t +dm_read_invis( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_off_t off, + dm_size_t len, + void *bufp); + +extern int +dm_release_right( /* not completely supported; see caveat above */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token); + +extern int +dm_remove_dmattr( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + int setdtime, + dm_attrname_t *attrnamep); + +extern int +dm_request_right( /* not completely supported; see caveat above */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int flags, + dm_right_t right); + +extern int +dm_respond_event( + dm_sessid_t sid, + dm_token_t token, + dm_response_t response, + int reterror, + size_t buflen, + void *respbufp); + +extern int +dm_send_msg( + dm_sessid_t targetsid, + dm_msgtype_t msgtype, + size_t buflen, + void *bufp); + +extern int +dm_set_disp( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_eventset_t *eventsetp, + unsigned int maxevent); + +extern int +dm_set_dmattr( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_attrname_t *attrnamep, + int setdtime, + size_t buflen, + void *bufp); + +extern int +dm_set_eventlist( 
+ dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_eventset_t *eventsetp, + unsigned int maxevent); + +extern int +dm_set_fileattr( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int mask, + dm_fileattr_t *attrp); + +extern int +dm_set_inherit( /* not supported */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_attrname_t *attrnamep, + mode_t mode); + +extern int +dm_set_region( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + unsigned int nelem, + dm_region_t *regbufp, + dm_boolean_t *exactflagp); + +extern int +dm_set_return_on_destroy( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_attrname_t *attrnamep, + dm_boolean_t enable); + +extern int +dm_symlink_by_handle( /* not supported */ + dm_sessid_t sid, + void *dirhanp, + size_t dirhlen, + dm_token_t token, + void *hanp, + size_t hlen, + char *cname, + char *path); + +extern int +dm_sync_by_handle( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token); + +extern int +dm_upgrade_right( /* not completely supported; see caveat above */ + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token); + +extern dm_ssize_t +dm_write_invis( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + int flags, + dm_off_t off, + dm_size_t len, + void *bufp); + +/* Non-standard SGI additions to the DMAPI interface. */ + +int +dm_open_by_handle( + void *hanp, + size_t hlen, + int mode); + +extern int +dm_get_dioinfo( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_dioinfo_t *diop); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMAPI_H */ + diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_attr.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_attr.c --- linux-2.4.19/fs/xfs/dmapi/dmapi_attr.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_attr.c Tue Jul 30 18:08:15 2002 @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. 
All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "dmapi_private.h" + + +/* Retrieve attributes for a single file, directory or symlink. 
*/
+
+int
+dm_get_fileattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_stat_t	*statp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	/* Look up the tdp for this sid/handle/token (DM_TDT_VNO,
+	   DM_RIGHT_SHARED); a non-zero result is returned unchanged.
+	*/
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		/* Call the filesystem's get_fileattr routine between the
+		   VN_BHV_READ_LOCK/VN_BHV_READ_UNLOCK pair, then drop the tdp.
+		*/
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->get_fileattr(tdp->td_vp, tdp->td_right,
+				mask, statp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Set file attributes of a file, directory, or symlink. 'mask' and 'attrp'
+   are passed through to the filesystem's set_fileattr routine.
+*/
+
+int
+dm_set_fileattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_fileattr_t	*attrp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	/* Unlike dm_get_fileattr(), DM_RIGHT_EXCL is requested here. */
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+			DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->set_fileattr(tdp->td_vp, tdp->td_right,
+				mask, attrp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_bulkattr.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_bulkattr.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_bulkattr.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_bulkattr.c Tue Jul 30 18:08:15 2002
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+/* Hand 'locp' to the filesystem's init_attrloc routine for the object
+   named by the handle. The handle must resolve with type
+   DM_TDT_VFS|DM_TDT_DIR and DM_RIGHT_SHARED.
+*/
+
+int
+dm_init_attrloc(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrloc_t	*locp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS|DM_TDT_DIR,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		/* Filesystem call is bracketed by VN_BHV_READ_LOCK/UNLOCK. */
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->init_attrloc(tdp->td_vp, tdp->td_right, locp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/*
+ * Retrieves both standard and DM specific file attributes for the file
+ * system indicated by the handle. (The FS has to be mounted).
+ * Syscall returns 1 to indicate SUCCESS and more information is available.
+ * -1 is returned on error, and errno will be set appropriately.
+ * 0 is returned upon successful completion.
+ */
+
+int
+dm_get_bulkattr_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	/* The handle must resolve as DM_TDT_VFS with DM_RIGHT_SHARED. */
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		/* All caller buffers are passed straight to the filesystem. */
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->get_bulkattr_rvp(tdp->td_vp, tdp->td_right,
+				mask, locp, buflen, bufp, rlenp, rvp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/*
+ * Retrieves attributes of directory entries given a handle to that
+ * directory. Iterative.
+ * Syscall returns 1 to indicate SUCCESS and more information is available.
+ * -1 is returned on error, and errno will be set appropriately.
+ * 0 is returned upon successful completion.
+ */
+
+int
+dm_get_dirattrs_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	/* The handle must resolve as DM_TDT_DIR with DM_RIGHT_SHARED. */
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_DIR,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->get_dirattrs_rvp(tdp->td_vp, tdp->td_right,
+				mask, locp, buflen, bufp, rlenp, rvp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same calling pattern as dm_get_bulkattr_rvp(), with an additional
+   'attrnamep' argument forwarded to the filesystem's get_bulkall_rvp
+   routine.
+*/
+
+int
+dm_get_bulkall_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_attrname_t	*attrnamep,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->get_bulkall_rvp(tdp->td_vp, tdp->td_right,
+				mask, attrnamep, locp, buflen, bufp, rlenp, rvp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_config.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_config.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_config.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_config.c Tue Jul 30 18:08:15 2002
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+/* Report the value of configuration option 'flagname' through 'retvalp'.
+   Options handled in the switch below are answered directly with
+   copy_to_user(); every other option is forwarded to the filesystem that
+   owns the handle. Returns 0 or a positive errno (EFAULT if the copy
+   to 'retvalp' fails).
+*/
+
+int
+dm_get_config(
+	void		*hanp,
+	size_t		hlen,
+	dm_config_t	flagname,
+	dm_size_t	*retvalp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	dm_size_t	retval;
+	int		fs_specific = 0;
+	int		rc;
+
+	/* Trap and process configuration parameters which are system-wide. */
+
+	switch (flagname) {
+	case DM_CONFIG_MAX_MESSAGE_DATA:
+		retval = DM_MAX_MSG_DATA;
+		break;
+	case DM_CONFIG_LEGACY:
+	case DM_CONFIG_PENDING:
+	case DM_CONFIG_OBJ_REF:
+		retval = DM_TRUE;
+		break;
+	default:
+		fs_specific = 1;
+		break;
+	}
+	if (!fs_specific) {
+		if (copy_to_user(retvalp, &retval, sizeof(retval)))
+			return(EFAULT);
+		return(0);
+	}
+
+	/* Must be filesystem-specific. Convert the handle into a vnode,
+	   then let the filesystem-specific routine determine the value of
+	   the configuration option for that filesystem.
+	*/
+
+	rc = dm_get_config_tdp(hanp, hlen, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->get_config(tdp->td_vp, tdp->td_right,
+				flagname, retvalp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Ask the filesystem that owns the handle which events it supports;
+   'nelem', 'eventsetp' and 'nelemp' are forwarded to its
+   get_config_events routine.
+*/
+
+int
+dm_get_config_events(
+	void		*hanp,
+	size_t		hlen,
+	u_int		nelem,
+	dm_eventset_t	*eventsetp,
+	u_int		*nelemp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	/* Convert the handle into a vnode. */
+
+	rc = dm_get_config_tdp(hanp, hlen, &tdp);
+	if (rc == 0) {
+		/* Now call the filesystem-specific routine to determine the
+		   events supported by that filesystem.
+		*/
+
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->get_config_events(tdp->td_vp, tdp->td_right,
+				nelem, eventsetp, nelemp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_dmattr.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_dmattr.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_dmattr.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_dmattr.c Tue Jul 30 18:08:15 2002
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+/* Every routine in this file follows one pattern: resolve the
+   sid/handle/token into a tdp with the stated type and right, call the
+   matching filesystem routine between VN_BHV_READ_LOCK and
+   VN_BHV_READ_UNLOCK, release the tdp, and return the filesystem's result.
+   A failure to resolve the tdp is returned unchanged.
+*/
+
+/* DM_TDT_VFS handle, DM_RIGHT_EXCL; forwards 'attrnamep' to clear_inherit. */
+
+int
+dm_clear_inherit(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+			DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->clear_inherit(tdp->td_vp, tdp->td_right,
+				attrnamep);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* DM_TDT_VNO handle, DM_RIGHT_SHARED; forwards the attribute name and the
+   caller's buffer description to get_dmattr.
+*/
+
+int
+dm_get_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->get_dmattr(tdp->td_vp, tdp->td_right,
+				attrnamep, buflen, bufp, rlenp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* DM_TDT_VNO handle, DM_RIGHT_SHARED; forwards the caller's buffer
+   description to getall_dmattr.
+*/
+
+int
+dm_getall_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->getall_dmattr(tdp->td_vp, tdp->td_right,
+				buflen, bufp, rlenp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* DM_TDT_VFS handle, DM_RIGHT_SHARED; forwards 'nelem', 'inheritbufp' and
+   'nelemp' to getall_inherit.
+*/
+
+int
+dm_getall_inherit(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		nelem,
+	dm_inherit_t	*inheritbufp,
+	u_int		*nelemp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+			DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->getall_inherit(tdp->td_vp, tdp->td_right,
+				nelem, inheritbufp, nelemp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* DM_TDT_VNO handle, DM_RIGHT_EXCL; forwards 'setdtime' and 'attrnamep' to
+   remove_dmattr.
+*/
+
+int
+dm_remove_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	int		setdtime,
+	dm_attrname_t	*attrnamep)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+			DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->remove_dmattr(tdp->td_vp, tdp->td_right,
+				setdtime, attrnamep);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* DM_TDT_VNO handle, DM_RIGHT_EXCL; forwards the attribute name, 'setdtime'
+   and the caller's buffer to set_dmattr.
+*/
+
+int
+dm_set_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	int		setdtime,
+	size_t		buflen,
+	void		*bufp)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+			DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->set_dmattr(tdp->td_vp, tdp->td_right,
+				attrnamep, setdtime, buflen, bufp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* DM_TDT_VFS handle, DM_RIGHT_EXCL; forwards 'attrnamep' and 'mode' to
+   set_inherit.
+*/
+
+int
+dm_set_inherit(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	mode_t		mode)
+{
+	dm_fsys_vector_t *fsops;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+			DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsops = dm_fsys_vector(tdp->td_vp);
+		rc = fsops->set_inherit(tdp->td_vp, tdp->td_right,
+				attrnamep, mode);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_event.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_event.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_event.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_event.c Thu Aug 1 12:24:25 2002
@@ -0,0 +1,859 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "dmapi_private.h" + +/* The "rights" portion of the DMAPI spec is not currently implemented. A + framework for rights is provided in the code, but turns out to be a no-op + in practice. The following comments are a brain dump to serve as input to + the poor soul that eventually has to get DMAPI rights working in IRIX. + + A DMAPI right is similar but not identical to the mrlock_t mechanism + already used within the kernel. The similarities are that it is a + sleeping lock, and that a multiple-reader, single-writer protocol is used. + How locks are obtained and dropped is different, however. With a mrlock_t, + a thread grabs the lock, does some stuff, then drops the lock, and all other + threads block in the meantime (assuming a write lock). There is a one-to- + one relationship between the lock and the thread which obtained the lock. + Not so with DMAPI right locks. A DMAPI lock is associated with a particular + session/token/hanp/hlen quad; since there is a dm_tokdata_t structure for + each such quad, you can think of it as a one-to-one relationship between the + lock and a dm_tokdata_t. Any application thread which presents the correct + quad is entitled to grab or release the lock, or to use the rights + associated with that lock. The thread that grabs the lock does not have to + be the one to use the lock, nor does it have to be the thread which drops + the lock. The lock can be held for very long periods of time, even across + multiple system calls by multiple application threads.
The idea is that a + coordinated group of DMAPI application threads can grab the lock, issue a + series of inode accesses and/or updates, then drop the lock, and be assured + that no other thread in the system could be modifying the inode at the same + time. The kernel is expected to blindly trust that the application will + not forget to unlock inodes it has locked, and will not deadlock itself + against the kernel. + + There are two types of DMAPI rights, file object (inode) and filesystem + object (superblock?). An inode right is the equivalent of the combination + of both the XFS ilock and iolock; if held exclusively, no data or metadata + within the file can be changed by non-lock-holding threads. The filesystem + object lock is a little fuzzier; I think that if it is held, things like + unmounts can be blocked, plus there is an event mask associated with the + filesystem which can't be updated without the lock. (By the way, that + event mask is supposed to be persistent in the superblock; add that to + your worklist :-) + + All events generated by XFS currently arrive with no rights, i.e. + DM_RIGHT_NULL, and return to the filesystem with no rights. It would be + smart to leave it this way if possible, because it otherwise becomes more + likely that an application thread will deadlock against the kernel if the + one responsible for calling dm_get_events() happens to touch a file which + was locked at the time the event was queued. Since the thread is blocked, + it can't read the event in order to find and drop the lock. Catch-22. If + you do have events that arrive with non-null rights, then dm_enqueue() needs + to have code added for synchronous events which atomically switches the + right from being a thread-based right to a dm_tokdata_t-based right without + allowing the lock to drop in between. You will probably have to add a new + dm_fsys_vector entry point to do this. 
The lock can't be lost during the + switch, or other threads might change the inode or superblock in between. + Likewise, if you need to return to the filesystem holding a right, then + you need a DMAPI-to-thread atomic switch to occur, most likely in + dm_change_right(). Again, the lock must not be lost during the switch; the + DMAPI spec spends a couple of pages stressing this. Another dm_fsys_vector + entry point is probably the answer. + + There are several assumptions implied in the current layout of the code. + First of all, if an event returns to the filesystem with a return value of + zero, then the filesystem can assume that any locks (rights) held at the + start of the event are still in effect at the end of the event. (Note that + the application could have temporarily dropped and reacquired the right + while the event was outstanding, however). If the event returns to the + filesystem with an errno, then the filesystem must assume that it has lost + any and all rights associated with any of the objects in the event. This + was done for a couple of reasons. First of all, since an errno is being + returned, most likely the filesystem is going to immediately drop all the + locks anyway. If the DMAPI code were required to unconditionally reobtain + all locks before returning to the filesystem, then dm_pending() wouldn't + work for NFS server threads because the process would block indefinitely + trying to get its thread-based rights back, because the DMAPI-rights + associated with the dm_tokdata_t in the outstanding event would prevent + the rights from being obtained. That would be a bad thing. We wouldn't + be able to let users Ctrl-C out of read/write/truncate events either. + + If a case should ever surface where the thread has lost its rights even + though it has a zero return status, or where the thread has rights even + though it is returning with an errno, then this logic will have to be + reworked.
This could be done by changing the 'right' parameters on all
+   the event calls to (dm_right_t *), so that they could serve both as IN
+   and OUT parameters.
+
+   Some events such as DM_EVENT_DESTROY arrive without holding a vnode
+   reference; if you don't have a vnode reference, you can't have a right
+   on the file.
+
+   One more quirk. The DM_EVENT_UNMOUNT event is defined to be synchronous
+   when its behavior is asynchronous. If an unmount event arrives with
+   rights, the event should return with the same rights and should NOT leave
+   any rights in the dm_tokdata_t where the application could use them.
+*/
+
+
+/* Offset of the first byte past the region described by 'vdat'. */
+#define GETNEXTOFF(vdat) ((vdat).vd_offset + (vdat).vd_length)
+/* FSHSIZE when the tdp has DM_TDT_VFS set, else XFS_HSIZE of its handle. */
+#define HANDLE_SIZE(tdp) \
+	((tdp)->td_type & DM_TDT_VFS ? FSHSIZE : XFS_HSIZE((tdp)->td_handle))
+
+
+/* Allocate and fill in a tdp describing 'vp', a vnode in a filesystem known
+   to support DMAPI. Returns NULL (after logging) if the allocation fails;
+   panics if dm_vp_to_handle() cannot produce a handle for the vnode.
+*/
+
+static dm_tokdata_t *
+dm_vp_data(
+	vnode_t		*vp,
+	dm_right_t	right,
+	int		referenced)	/* != 0, caller holds vnode reference */
+{
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	tdp = kmem_cache_alloc(dm_tokdata_cachep, SLAB_KERNEL);
+	if (!tdp) {
+		printk("%s/%d: kmem_cache_alloc(dm_tokdata_cachep) returned NULL\n", __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	tdp->td_next = NULL;
+	tdp->td_tevp = NULL;
+	tdp->td_app_ref = 0;
+	tdp->td_vcount = 0;
+	tdp->td_orig_right = right;
+	tdp->td_right = right;
+
+	/* The vnode pointer is only recorded, and DM_TDF_EVTREF only set,
+	   when the caller says it holds a reference.
+	*/
+	if (referenced) {
+		tdp->td_flags = DM_TDF_ORIG | DM_TDF_EVTREF;
+		tdp->td_vp = vp;
+	} else {
+		tdp->td_flags = DM_TDF_ORIG;
+		tdp->td_vp = NULL;
+	}
+
+	switch (vp->v_type) {
+	case VREG:
+		tdp->td_type = DM_TDT_REG;
+		break;
+	case VDIR:
+		tdp->td_type = DM_TDT_DIR;
+		break;
+	case VLNK:
+		tdp->td_type = DM_TDT_LNK;
+		break;
+	default:
+		tdp->td_type = DM_TDT_OTH;
+		break;
+	}
+
+	rc = dm_vp_to_handle(vp, &tdp->td_handle);
+	if (rc != 0) {
+		panic("dm_vp_data: dm_vp_to_handle failed for vp %p in "
+			"a DMAPI filesystem, errno %d\n", vp, rc);
+	}
+
+	return tdp;
+}
+
+
+/* Allocate and fill in a DM_TDT_VFS tdp for 'vfsp', a filesystem known to
+   support DMAPI. The handle holds the filesystem's vfs_altfsid followed
+   by zeroes. Returns NULL (after logging) if the allocation fails.
+*/
+static dm_tokdata_t *
+dm_vfs_data(
+	vfs_t		*vfsp,
+	vnode_t		*vp,		/* will be NULL for DM_EVENT_UNMOUNT */
+	dm_right_t	right)
+{
+	dm_tokdata_t	*tdp;
+
+	tdp = kmem_cache_alloc(dm_tokdata_cachep, SLAB_KERNEL);
+	if (!tdp) {
+		printk("%s/%d: kmem_cache_alloc(dm_tokdata_cachep) returned NULL\n", __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	tdp->td_next = NULL;
+	tdp->td_tevp = NULL;
+	tdp->td_app_ref = 0;
+	tdp->td_vcount = 0;
+	tdp->td_orig_right = right;
+	tdp->td_right = right;
+	tdp->td_type = DM_TDT_VFS;
+
+	/* DM_TDF_EVTREF is set only when a vnode was supplied; td_vp is
+	   that vnode, or NULL when there is none.
+	*/
+	tdp->td_flags = vp ? (DM_TDF_ORIG | DM_TDF_EVTREF) : DM_TDF_ORIG;
+	tdp->td_vp = vp;
+
+	/* Copy in the fsid, then clear the rest of the handle. */
+	bcopy(vfsp->vfs_altfsid, &tdp->td_handle.ha_fsid, sizeof(fsid_t));
+	bzero((char *)&tdp->td_handle.ha_fsid + sizeof(fsid_t),
+		sizeof(tdp->td_handle) - sizeof(fsid_t));
+
+	return tdp;
+}
+
+
+/* Push 'tdp' onto the front of the event's te_tdp list and point the tdp
+   back at its event.
+*/
+
+static void
+dm_add_handle_to_event(
+	dm_tokevent_t	*tevp,
+	dm_tokdata_t	*tdp)
+{
+	tdp->td_tevp = tevp;
+	tdp->td_next = tevp->te_tdp;
+	tevp->te_tdp = tdp;
+}
+
+
+/* Generate the given data event for the vnode, and wait for a reply. The
+   caller must guarantee that the vnode's reference count is greater than zero
+   so that the filesystem can't disappear while the request is outstanding.
+*/
+
+int
+dm_send_data_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp,
+	dm_right_t	vp_right,	/* current right for vp */
+	off_t		offset,
+	size_t		length,
+	int		flags)		/* 0 or DM_FLAGS_NDELAY */
+{
+	dm_data_event_t	*datap;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp;
+	vnode_t		*vp;
+	int		error;
+
+	vp = BHV_TO_VNODE(bdp);
+	tdp = dm_vp_data(vp, vp_right, /* reference held */ 1);
+	if (tdp == NULL)
+		return ENOMEM;
+
+	/* Calculate the size of the event in bytes, create an event structure
+	   for it, and insert the file's handle into the event.
+ */ + + tevp = dm_evt_create_tevp(event, HANDLE_SIZE(tdp), (void **)&datap); + if (tevp == NULL) { + kmem_cache_free(dm_tokdata_cachep, tdp); + return(ENOMEM); + } + dm_add_handle_to_event(tevp, tdp); + + /* Now fill in all the dm_data_event_t fields. */ + + datap->de_handle.vd_offset = sizeof(*datap); + datap->de_handle.vd_length = HANDLE_SIZE(tdp); + bcopy(&tdp->td_handle, (char *)datap + datap->de_handle.vd_offset, + datap->de_handle.vd_length); + datap->de_offset = offset; + datap->de_length = length; + + /* Queue the message and wait for the reply. */ + + error = dm_enqueue_normal_event(vp->v_vfsp, tevp, flags); + + /* If no errors occurred, we must leave with the same rights we had + upon entry. If errors occurred, we must leave with no rights. + */ + + dm_evt_rele_tevp(tevp, error); + + return(error); +} + + +/* Generate the destroy event for the vnode and wait until the request has been + queued. The caller does not hold a vnode reference or a right on the vnode, + but it must otherwise lock down the vnode such that the filesystem can't + disappear while the request is waiting to be queued. While waiting to be + queued, the vnode must not be referenceable either by path or by a call + to dm_handle_to_vp(). +*/ + +int +dm_send_destroy_event( + bhv_desc_t *bdp, + dm_right_t vp_right) /* always DM_RIGHT_NULL */ +{ + dm_fsys_vector_t *fsys_vector; + dm_tokevent_t *tevp; + dm_tokdata_t *tdp; + dm_destroy_event_t *destp; + dm_attrname_t attrname; + vnode_t *vp; + char *value; + int value_len; + int error; + + vp = BHV_TO_VNODE(bdp); + tdp = dm_vp_data(vp, vp_right, /* no reference held */ 0); + if (tdp == NULL) + return ENOMEM; + + if ((error = dm_waitfor_destroy_attrname(vp->v_vfsp, &attrname)) != 0) + return(error); + + /* If a return-on-destroy attribute name exists for this filesystem, + see if the object being deleted has this attribute. If the object + doesn't have the attribute or if we encounter an error, then send + the event without the attribute. 
+ */ + + value_len = -1; /* because zero is a valid attribute length */ + if (attrname.an_chars[0] != '\0') { + fsys_vector = dm_fsys_vector(vp); + error = fsys_vector->get_destroy_dmattr(vp, vp_right, &attrname, + &value, &value_len); + if (error) + return error; + } + + /* Now that we know the size of the attribute value, if any, calculate + the size of the event in bytes, create an event structure for it, + and insert the handle into the event. + */ + + tevp = dm_evt_create_tevp(DM_EVENT_DESTROY, + HANDLE_SIZE(tdp) + (value_len >= 0 ? value_len : 0), + (void **)&destp); + if (tevp == NULL) { + kmem_cache_free(dm_tokdata_cachep, tdp); + if (value_len > 0) + kfree(value); + return(ENOMEM); + } + dm_add_handle_to_event(tevp, tdp); + + /* Now fill in all the dm_destroy_event_t fields. */ + + destp->ds_handle.vd_offset = sizeof(*destp); + destp->ds_handle.vd_length = HANDLE_SIZE(tdp); + bcopy(&tdp->td_handle, (char *)destp + destp->ds_handle.vd_offset, + destp->ds_handle.vd_length); + if (value_len >= 0) { + destp->ds_attrname = attrname; + destp->ds_attrcopy.vd_length = value_len; + if (value_len == 0) { + destp->ds_attrcopy.vd_offset = 0; + } else { + destp->ds_attrcopy.vd_offset = GETNEXTOFF(destp->ds_handle); + bcopy(value, (char *)destp + destp->ds_attrcopy.vd_offset, + value_len); + kfree(value); + } + } + + /* Queue the message asynchronously. */ + + error = dm_enqueue_normal_event(vp->v_vfsp, tevp, 0); + + /* Since we had no rights upon entry, we have none to reobtain before + leaving. + */ + + dm_evt_rele_tevp(tevp, 1); + + return(error); +} + + +/* The dm_mount_event_t event is sent in turn to all sessions that have asked + for it until one either rejects it or accepts it. The filesystem is not + going anywhere because the mount is blocked until the event is answered. 
+
+   Returns ENOMEM if any tdp or the event itself cannot be allocated,
+   otherwise the value returned by dm_enqueue_mount_event(). name1 and
+   name2 are passed to strlen() and so must not be NULL.
+*/
+
+int
+dm_send_mount_event(
+	vfs_t		*vfsp,		/* filesystem being mounted */
+	dm_right_t	vfsp_right,
+	bhv_desc_t	*bdp,		/* mounted on directory */
+	dm_right_t	vp_right,
+	bhv_desc_t	*rootbdp,
+	dm_right_t	rootvp_right,
+	char		*name1,		/* mount path */
+	char		*name2)		/* filesystem device name */
+{
+	int		error;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp1 = NULL;	/* filesystem handle for event */
+	dm_tokdata_t	*tdp2 = NULL;	/* file handle for mounted-on dir. */
+	dm_tokdata_t	*tdp3 = NULL;	/* file handle for root vnode */
+	dm_mount_event_t *mp;
+	size_t		nextoff;
+	size_t		name1_len;	/* length of name1 including its NUL */
+	size_t		name2_len;	/* length of name2 including its NUL */
+	vnode_t		*vp = NULL;	/* mounted on directory */
+	vnode_t		*rootvp = BHV_TO_VNODE(rootbdp);
+
+	/* Convert the vfsp to a filesystem handle, and vp and rootvp into
+	   file handles. vp (the mounted-on directory) may not have a handle
+	   if it is a different filesystem type such as EFS which does not
+	   support DMAPI.
+	*/
+
+	if (bdp)
+		vp = BHV_TO_VNODE(bdp);
+
+	tdp1 = dm_vfs_data(vfsp, rootvp, vfsp_right);
+	if (tdp1 == NULL)
+		goto out_nomem;
+
+	if ((vp == NULL) || dm_check_dmapi_vp(vp)) {
+		vp = NULL;	/* assume we are mounting on non XFS */
+	} else {
+		tdp2 = dm_vp_data(vp, vp_right, /* reference held */ 1);
+		if (tdp2 == NULL)
+			goto out_nomem;
+	}
+
+	tdp3 = dm_vp_data(rootvp, rootvp_right, /* reference held */ 1);
+	if (tdp3 == NULL)
+		goto out_nomem;
+
+	/* Calculate the size of the event in bytes, create an event structure
+	   for it, and insert the handles into the event.
+	*/
+
+	name1_len = strlen(name1) + 1;
+	name2_len = strlen(name2) + 1;
+
+	tevp = dm_evt_create_tevp(DM_EVENT_MOUNT,
+			HANDLE_SIZE(tdp1) + (vp ? HANDLE_SIZE(tdp2) : 0) +
+			HANDLE_SIZE(tdp3) + name1_len + name2_len,
+			(void **)&mp);
+	if (tevp == NULL)
+		goto out_nomem;
+
+	dm_add_handle_to_event(tevp, tdp1);
+	if (vp)
+		dm_add_handle_to_event(tevp, tdp2);
+	dm_add_handle_to_event(tevp, tdp3);
+
+	/* Now fill in all the dm_mount_event_t fields. */
+
+	mp->me_handle1.vd_offset = sizeof(*mp);
+	mp->me_handle1.vd_length = HANDLE_SIZE(tdp1);
+	bcopy(&tdp1->td_handle, (char *) mp + mp->me_handle1.vd_offset,
+			mp->me_handle1.vd_length);
+	nextoff = GETNEXTOFF(mp->me_handle1);
+
+	if (vp) {
+		mp->me_handle2.vd_offset = nextoff;
+		mp->me_handle2.vd_length = HANDLE_SIZE(tdp2);
+		bcopy(&tdp2->td_handle, (char *)mp + mp->me_handle2.vd_offset,
+			mp->me_handle2.vd_length);
+		nextoff = GETNEXTOFF(mp->me_handle2);
+	}
+
+	mp->me_name1.vd_offset = nextoff;
+	mp->me_name1.vd_length = name1_len;
+	bcopy(name1, (char *)mp + mp->me_name1.vd_offset, mp->me_name1.vd_length);
+	nextoff = GETNEXTOFF(mp->me_name1);
+
+	mp->me_name2.vd_offset = nextoff;
+	mp->me_name2.vd_length = name2_len;
+	bcopy(name2, (char *)mp + mp->me_name2.vd_offset, mp->me_name2.vd_length);
+	nextoff = GETNEXTOFF(mp->me_name2);
+
+	mp->me_roothandle.vd_offset = nextoff;
+	mp->me_roothandle.vd_length = HANDLE_SIZE(tdp3);
+	bcopy(&tdp3->td_handle, (char *)mp + mp->me_roothandle.vd_offset,
+			mp->me_roothandle.vd_length);
+
+	mp->me_mode = (vfsp->vfs_flag & VFS_RDONLY ? DM_MOUNT_RDONLY : 0);
+
+	/* Queue the message and wait for the reply. */
+
+	error = dm_enqueue_mount_event(vfsp, tevp);
+
+	/* If no errors occurred, we must leave with the same rights we had
+	   upon entry. If errors occurred, we must leave with no rights.
+	*/
+
+	dm_evt_rele_tevp(tevp, error);
+
+	return(error);
+
+out_nomem:
+	/* Every jump here happens before an event exists, so only the tdp
+	   structures allocated so far need to be released.
+	*/
+	if (tdp1)
+		kmem_cache_free(dm_tokdata_cachep, tdp1);
+	if (tdp2)
+		kmem_cache_free(dm_tokdata_cachep, tdp2);
+	if (tdp3)
+		kmem_cache_free(dm_tokdata_cachep, tdp3);
+	return ENOMEM;
+}
+
+
+/* Generate a DM_EVENT_UNMOUNT event and wait for a reply. The 'retcode'
+   field indicates whether this is a successful or unsuccessful unmount.
+   If successful, the filesystem is already unmounted, and any pending handle
+   reference to the filesystem will be failed.
If the unmount was
+   unsuccessful, then the filesystem will be placed back into full service.
+
+   The DM_EVENT_UNMOUNT event should really be asynchronous, because the
+   application has no control over whether or not the unmount succeeds. (The
+   DMAPI spec defined it that way because asynchronous events aren't always
+   guaranteed to be delivered.)
+
+   Since the filesystem is already unmounted in the successful case, the
+   DM_EVENT_UNMOUNT event can't make available any vnode to be used in
+   subsequent sid/hanp/hlen/token calls by the application. The event will
+   hang around until the application does a DM_RESP_CONTINUE, but the handle
+   within the event is unusable by the application.
+
+   After a successful unmount (retcode == 0) the filesystem is removed from
+   the list of DMAPI filesystems on every exit path, including the ones where
+   the event could not be allocated.
+*/
+
+void
+dm_send_unmount_event(
+	vfs_t		*vfsp,
+	vnode_t		*vp,		/* NULL if unmount successful */
+	dm_right_t	vfsp_right,
+	mode_t		mode,
+	int		retcode,	/* errno, if unmount failed */
+	int		flags)
+{
+	dm_namesp_event_t	*np;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp1;
+
+	/* If the unmount failed, put the filesystem back into full service,
+	   allowing blocked handle references to finish. If it succeeded, put
+	   the filesystem into the DM_STATE_UNMOUNTED state and fail all
+	   blocked DM_NO_TOKEN handle accesses.
+	*/
+
+	if (retcode != 0) {	/* unmount was unsuccessful */
+		dm_change_fsys_entry(vfsp, DM_STATE_MOUNTED);
+	} else {
+		dm_change_fsys_entry(vfsp, DM_STATE_UNMOUNTED);
+	}
+
+	/* If the event wasn't in the filesystem dm_eventset_t, just remove
+	   the filesystem from the list of DMAPI filesystems and return.
+	*/
+
+	if (flags & DM_FLAGS_UNWANTED)
+		goto out_remove;
+
+	/* Calculate the size of the event in bytes and allocate zeroed memory
+	   for it.
+	*/
+
+	tdp1 = dm_vfs_data(vfsp, vp, vfsp_right);
+	if (tdp1 == NULL)
+		goto out_remove;
+
+	tevp = dm_evt_create_tevp(DM_EVENT_UNMOUNT, HANDLE_SIZE(tdp1),
+			(void **)&np);
+	if (tevp == NULL) {
+		kmem_cache_free(dm_tokdata_cachep, tdp1);
+		goto out_remove;
+	}
+
+	dm_add_handle_to_event(tevp, tdp1);
+
+	/* Now copy in all the dm_namesp_event_t specific fields. */
+
+	np->ne_handle1.vd_offset = sizeof(*np);
+	np->ne_handle1.vd_length = HANDLE_SIZE(tdp1);
+	bcopy(&tdp1->td_handle, (char *) np + np->ne_handle1.vd_offset,
+			np->ne_handle1.vd_length);
+	np->ne_mode = mode;
+	np->ne_retcode = retcode;
+
+	/* Since DM_EVENT_UNMOUNT is effectively asynchronous, queue the
+	   message and ignore any error return for DM_EVENT_UNMOUNT.
+	*/
+
+	(void)dm_enqueue_normal_event(vfsp, tevp, flags);
+
+	if (retcode == 0)
+		dm_remove_fsys_entry(vfsp);
+
+	dm_evt_rele_tevp(tevp, 0);
+	return;
+
+out_remove:
+	/* No event is being sent. A successfully unmounted filesystem must
+	   still be dropped from the list of DMAPI filesystems.
+	*/
+	if (retcode == 0)
+		dm_remove_fsys_entry(vfsp);
+}
+
+
+/* Generate the given namespace event and wait for a reply (if synchronous) or
+   until the event has been queued (asynchronous). The caller must guarantee
+   that at least one vnode within the filesystem has had its reference count
+   bumped so that the filesystem can't disappear while the event is
+   outstanding.
+
+   Returns ENOMEM if a tdp or the event cannot be allocated, otherwise the
+   value returned by dm_enqueue_normal_event(). An unwanted
+   DM_EVENT_PREUNMOUNT only moves the filesystem to DM_STATE_UNMOUNTING and
+   returns 0 without allocating anything.
+*/
+
+int
+dm_send_namesp_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp1,
+	dm_right_t	vp1_right,
+	bhv_desc_t	*bdp2,
+	dm_right_t	vp2_right,
+	char		*name1,
+	char		*name2,
+	mode_t		mode,
+	int		retcode,
+	int		flags)
+{
+	dm_namesp_event_t	*np;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp1 = NULL;	/* primary handle for event */
+	dm_tokdata_t	*tdp2 = NULL;	/* additional handle for event */
+	vfs_t		*sidvfsp;	/* vfs event must be registered on */
+	size_t		nextoff;
+	int		error;
+	vnode_t		*vp1;
+
+	vp1 = BHV_TO_VNODE(bdp1);
+	sidvfsp = vp1->v_vfsp;
+
+	switch (event) {
+	case DM_EVENT_PREUNMOUNT:
+		/*
+		 * PREUNMOUNT - Send the file system handle in handle1,
+		 * and the handle for the root dir in the second. Otherwise
+		 * it's a normal sync message; i.e. succeeds or fails
+		 * depending on the app's return code.
+		 *	bdp1 and bdp2 are both the root dir of mounted FS
+		 *	vp1_right is the filesystem right.
+		 *	vp2_right is the root inode right.
+		 */
+
+		/* Nobody wants the event, so decide this before allocating
+		   any tdp; otherwise the structures would be leaked.
+		*/
+		if (flags & DM_FLAGS_UNWANTED) {
+			dm_change_fsys_entry(sidvfsp, DM_STATE_UNMOUNTING);
+			return(0);
+		}
+
+		tdp1 = dm_vfs_data(sidvfsp, vp1, vp1_right);
+		if (tdp1 == NULL)
+			return ENOMEM;
+		tdp2 = dm_vp_data(BHV_TO_VNODE(bdp2), vp2_right, /* reference held */ 1);
+		if (tdp2 == NULL) {
+			kmem_cache_free(dm_tokdata_cachep, tdp1);
+			return ENOMEM;
+		}
+		break;
+
+	case DM_EVENT_NOSPACE:
+		/* vp1_right is the filesystem right. */
+
+		tdp1 = dm_vfs_data(sidvfsp, vp1, vp1_right);
+		if (tdp1 == NULL)
+			return ENOMEM;
+		tdp2 = dm_vp_data(BHV_TO_VNODE(bdp2), vp2_right, /* reference held */ 1); /* additional info - not in the spec */
+		if (tdp2 == NULL) {
+			kmem_cache_free(dm_tokdata_cachep, tdp1);
+			return ENOMEM;
+		}
+		break;
+
+	default:
+		/* All other events only pass in vnodes and don't require any
+		   special cases.
+		*/
+
+		tdp1 = dm_vp_data(vp1, vp1_right, /* reference held */ 1);
+		if (tdp1 == NULL)
+			return ENOMEM;
+		if (bdp2) {
+			tdp2 = dm_vp_data(BHV_TO_VNODE(bdp2), vp2_right, /* reference held */ 1);
+			if (tdp2 == NULL) {
+				kmem_cache_free(dm_tokdata_cachep, tdp1);
+				return ENOMEM;
+			}
+		}
+	}
+
+	/* Calculate the size of the event in bytes and allocate zeroed memory
+	   for it.
+	*/
+
+	tevp = dm_evt_create_tevp(event,
+		HANDLE_SIZE(tdp1) + (bdp2 ? HANDLE_SIZE(tdp2) : 0) +
+		(name1 ? strlen(name1) + 1 : 0) +
+		(name2 ? strlen(name2) + 1 : 0), (void **)&np);
+	if (tevp == NULL) {
+		if (tdp1)
+			kmem_cache_free(dm_tokdata_cachep, tdp1);
+		if (tdp2)
+			kmem_cache_free(dm_tokdata_cachep, tdp2);
+		return(ENOMEM);
+	}
+
+	dm_add_handle_to_event(tevp, tdp1);
+	if (bdp2)
+		dm_add_handle_to_event(tevp, tdp2);
+
+	/* Now copy in all the dm_namesp_event_t specific fields. */
+
+	np->ne_handle1.vd_offset = sizeof(*np);
+	np->ne_handle1.vd_length = HANDLE_SIZE(tdp1);
+	bcopy(&tdp1->td_handle, (char *) np + np->ne_handle1.vd_offset,
+			np->ne_handle1.vd_length);
+	nextoff = GETNEXTOFF(np->ne_handle1);
+	if (bdp2) {
+		np->ne_handle2.vd_offset = nextoff;
+		np->ne_handle2.vd_length = HANDLE_SIZE(tdp2);
+		bcopy(&tdp2->td_handle, (char *)np + np->ne_handle2.vd_offset,
+				np->ne_handle2.vd_length);
+		nextoff = GETNEXTOFF(np->ne_handle2);
+	}
+	if (name1) {
+		np->ne_name1.vd_offset = nextoff;
+		np->ne_name1.vd_length = strlen(name1) + 1;
+		bcopy(name1, (char *)np + np->ne_name1.vd_offset,
+				np->ne_name1.vd_length);
+		nextoff = GETNEXTOFF(np->ne_name1);
+	}
+	if (name2) {
+		np->ne_name2.vd_offset = nextoff;
+		np->ne_name2.vd_length = strlen(name2) + 1;
+		bcopy(name2, (char *)np + np->ne_name2.vd_offset,
+				np->ne_name2.vd_length);
+	}
+	np->ne_mode = mode;
+	np->ne_retcode = retcode;
+
+	/* Queue the message and wait for the reply. */
+
+	error = dm_enqueue_normal_event(sidvfsp, tevp, flags);
+
+	/* If no errors occurred, we must leave with the same rights we had
+	   upon entry. If errors occurred, we must leave with no rights.
+	*/
+
+	dm_evt_rele_tevp(tevp, error);
+
+	if (!error && event == DM_EVENT_PREUNMOUNT) {
+		dm_change_fsys_entry(sidvfsp, DM_STATE_UNMOUNTING);
+	}
+
+	return(error);
+}
+
+
+/*
+ * Send a message of type "DM_EVENT_USER". Since no vnode is involved, we
+ * don't have to worry about rights here.
+ *
+ * Returns E2BIG if buflen exceeds DM_MAX_MSG_DATA, EINVAL if msgtype is
+ * neither DM_MSGTYPE_SYNC nor DM_MSGTYPE_ASYNC, ENOMEM if the event cannot
+ * be allocated, EFAULT if the message cannot be copied in from user space,
+ * and otherwise the value returned by dm_enqueue_sendmsg_event().
+ */
+
+int
+dm_send_msg(
+	dm_sessid_t	targetsid,
+	dm_msgtype_t	msgtype,		/* SYNC or ASYNC */
+	size_t		buflen,
+	void		*bufp)
+{
+	dm_tokevent_t	*event;
+	void		*payload;
+	int		wait_for_reply;
+	int		rc;
+
+	if (buflen > DM_MAX_MSG_DATA)
+		return E2BIG;
+	if (msgtype != DM_MSGTYPE_ASYNC && msgtype != DM_MSGTYPE_SYNC)
+		return EINVAL;
+	wait_for_reply = (msgtype == DM_MSGTYPE_SYNC) ? 1 : 0;
+
+	event = dm_evt_create_tevp(DM_EVENT_USER, buflen, (void **)&payload);
+	if (event == NULL)
+		return ENOMEM;
+
+	/* An empty message has nothing to copy in. */
+	if (buflen != 0) {
+		if (copy_from_user(payload, bufp, buflen)) {
+			dm_evt_rele_tevp(event, 0);
+			return EFAULT;
+		}
+	}
+
+	/* Enqueue the request and wait for the reply, then destroy the
+	   event and hand the reply back. (dm_pending is not supported
+	   here.)
+	*/
+	rc = dm_enqueue_sendmsg_event(targetsid, event, wait_for_reply);
+	dm_evt_rele_tevp(event, rc);
+
+	return rc;
+}
+
+
+/*
+ * Create an event of type "DM_EVENT_USER" holding the caller's message,
+ * queue it on session 'sid' with dm_enqueue_user_event(), and copy the
+ * resulting token out to 'tokenp'. Since no vnode is involved, we don't
+ * have to worry about rights here.
+ *
+ * Returns E2BIG if msglen exceeds DM_MAX_MSG_DATA, ENOMEM if the event
+ * cannot be allocated, EFAULT if the message cannot be copied in or the
+ * token cannot be copied out, the error from dm_enqueue_user_event() if
+ * queueing fails, and 0 otherwise.
+ */
+
+int
+dm_create_userevent(
+	dm_sessid_t	sid,
+	size_t		msglen,
+	void		*msgdatap,
+	dm_token_t	*tokenp)		/* return token created */
+{
+	dm_tokevent_t	*event;
+	dm_token_t	new_token;
+	void		*payload;
+	int		rc;
+
+	if (msglen > DM_MAX_MSG_DATA)
+		return E2BIG;
+
+	event = dm_evt_create_tevp(DM_EVENT_USER, msglen, (void **)&payload);
+	if (event == NULL)
+		return ENOMEM;
+
+	if (msglen != 0) {
+		if (copy_from_user(payload, msgdatap, msglen)) {
+			dm_evt_rele_tevp(event, 0);
+			return EFAULT;
+		}
+	}
+
+	/* Queue the message. If that didn't work, free the tevp structure. */
+	rc = dm_enqueue_user_event(sid, event, &new_token);
+	if (rc != 0) {
+		dm_evt_rele_tevp(event, 0);
+		return rc;
+	}
+
+	if (copy_to_user(tokenp, &new_token, sizeof(new_token)))
+		return EFAULT;
+
+	return 0;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_handle.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_handle.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_handle.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_handle.c Tue Jul 30 18:08:15 2002
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+/* Look up the tdp for the directory handle with dm_app_get_tdp(), passing
+   DM_TDT_DIR and DM_RIGHT_EXCL, then call the filesystem's create_by_handle
+   routine between VN_BHV_READ_LOCK and VN_BHV_READ_UNLOCK. Returns the
+   lookup error if there is one, otherwise the filesystem routine's result.
+*/
+int
+dm_create_by_handle(
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, dirhanp, dirhlen, token, DM_TDT_DIR,
+		DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->create_by_handle(tdp->td_vp, tdp->td_right,
+			hanp, hlen, cname);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same sequence as dm_create_by_handle(), but the filesystem routine called
+   is mkdir_by_handle.
+*/
+int
+dm_mkdir_by_handle(
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, dirhanp, dirhlen, token, DM_TDT_DIR,
+		DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->mkdir_by_handle(tdp->td_vp, tdp->td_right,
+			hanp, hlen, cname);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same sequence as dm_create_by_handle(), but the filesystem routine called
+   is symlink_by_handle, which additionally receives 'path'.
+*/
+int
+dm_symlink_by_handle(
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname,
+	char		*path)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, dirhanp, dirhlen, token, DM_TDT_DIR,
+		DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->symlink_by_handle(tdp->td_vp, tdp->td_right,
+			hanp, hlen, cname, path);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_hole.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_hole.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_hole.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_hole.c Tue Jul 30 18:08:15 2002
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+/* Look up the tdp for the handle with dm_app_get_tdp(), passing DM_TDT_REG
+   and DM_RIGHT_SHARED, then call the filesystem's get_allocinfo_rvp routine
+   between VN_BHV_READ_LOCK and VN_BHV_READ_UNLOCK. Returns the lookup error
+   if there is one, otherwise the filesystem routine's result.
+*/
+int
+dm_get_allocinfo_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	*offp,
+	u_int		nelem,
+	dm_extent_t	*extentp,
+	u_int		*nelemp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->get_allocinfo_rvp(tdp->td_vp, tdp->td_right,
+			offp, nelem, extentp, nelemp, rvp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same sequence as dm_get_allocinfo_rvp() (DM_TDT_REG, DM_RIGHT_SHARED), but
+   the filesystem routine called is probe_hole.
+*/
+int
+dm_probe_hole(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len,
+	dm_off_t	*roffp,
+	dm_size_t	*rlenp)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->probe_hole(tdp->td_vp, tdp->td_right,
+			off, len, roffp, rlenp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same sequence again, but the tdp is looked up with DM_RIGHT_EXCL and the
+   filesystem routine called is punch_hole.
+*/
+int
+dm_punch_hole(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->punch_hole(tdp->td_vp, tdp->td_right, off, len);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_io.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_io.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_io.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_io.c Tue Jul 30 18:08:15 2002
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+/* Look up the tdp for the handle with dm_app_get_tdp(), passing DM_TDT_REG
+   and DM_RIGHT_SHARED, then call the filesystem's read_invis_rvp routine
+   between VN_BHV_READ_LOCK and VN_BHV_READ_UNLOCK. Returns the lookup error
+   if there is one, otherwise the filesystem routine's result.
+*/
+int
+dm_read_invis_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->read_invis_rvp(tdp->td_vp, tdp->td_right,
+			off, len, bufp, rvp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same sequence as dm_read_invis_rvp(), but the tdp is looked up with
+   DM_RIGHT_EXCL and the filesystem routine called is write_invis_rvp,
+   which additionally receives 'flags'.
+*/
+int
+dm_write_invis_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	int		flags,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->write_invis_rvp(tdp->td_vp, tdp->td_right,
+			flags, off, len, bufp, rvp);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same sequence (DM_TDT_REG, DM_RIGHT_EXCL); the filesystem routine called
+   is sync_by_handle.
+*/
+int
+dm_sync_by_handle(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_EXCL, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->sync_by_handle(tdp->td_vp, tdp->td_right);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
+
+
+/* Same sequence (DM_TDT_REG, DM_RIGHT_SHARED); the filesystem routine called
+   is get_dioinfo.
+*/
+int
+dm_get_dioinfo(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_dioinfo_t	*diop)
+{
+	dm_fsys_vector_t *fsv;
+	dm_tokdata_t	*tdp;
+	int		rc;
+
+	rc = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (rc == 0) {
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsv = dm_fsys_vector(tdp->td_vp);
+		rc = fsv->get_dioinfo(tdp->td_vp, tdp->td_right, diop);
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+		dm_app_put_tdp(tdp);
+	}
+	return rc;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_kern.h linux-2.4-xfs/fs/xfs/dmapi/dmapi_kern.h
--- linux-2.4.19/fs/xfs/dmapi/dmapi_kern.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_kern.h Tue Jul 30 18:09:44 2002
@@ -0,0 +1,563 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __DMAPI_KERN_H__
+#define __DMAPI_KERN_H__
+
+
+/* One argument slot, readable either as a pointer or as a 64-bit integer. */
+union sys_dmapi_uarg {
+	void *p;
+	__u64 u;
+};
+typedef union sys_dmapi_uarg sys_dmapi_u;
+
+/* A block of eleven argument slots, uarg1 through uarg11. */
+struct sys_dmapi_args {
+	sys_dmapi_u uarg1, uarg2, uarg3, uarg4, uarg5, uarg6, uarg7, uarg8,
+		uarg9, uarg10, uarg11;
+};
+typedef struct sys_dmapi_args sys_dmapi_args_t;
+
+/* Select slot 'i' (1..11) of the sys_dmapi_args pointed to by 'uap' as an
+   integer (DM_Uarg) or as a pointer (DM_Parg). 'uap' and the whole
+   expansion are parenthesized so that any pointer expression may be passed
+   and the result may be used inside a larger expression.
+*/
+#define DM_Uarg(uap,i)	((uap)->uarg##i.u)
+#define DM_Parg(uap,i)	((uap)->uarg##i.p)
+
+#ifdef __KERNEL__
+
+struct xfs_handle_t;
+
+/* The first group of definitions and prototypes define the filesystem's
+   interface into the DMAPI code.
+*/
+
+
+/* Definitions used for the flags field on dm_send_data_event(),
+   dm_send_unmount_event(), and dm_send_namesp_event() calls.
+*/
+
+#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
+#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
+
+/* Possible code levels reported by dm_code_level(). */
+
+#define DM_CLVL_INIT		0	/* DMAPI prior to X/Open compliance */
+#define DM_CLVL_XOPEN		1	/* X/Open compliant DMAPI */
+
+
+/* Prototypes used outside of the DMI module/directory.
*/
+
+int		dm_send_data_event(
+			dm_eventtype_t	event,
+			struct bhv_desc	*bdp,
+			dm_right_t	vp_right,
+			off_t		off,
+			size_t		len,
+			int		flags);
+
+int		dm_send_destroy_event(
+			struct bhv_desc	*bdp,
+			dm_right_t	vp_right);
+
+int		dm_send_mount_event(
+			struct vfs	*vfsp,
+			dm_right_t	vfsp_right,
+			struct bhv_desc	*bdp,
+			dm_right_t	vp_right,
+			struct bhv_desc	*rootbdp,
+			dm_right_t	rootvp_right,
+			char		*name1,
+			char		*name2);
+
+int		dm_send_namesp_event(
+			dm_eventtype_t	event,
+			struct bhv_desc	*bdp1,
+			dm_right_t	vp1_right,
+			struct bhv_desc	*bdp2,
+			dm_right_t	vp2_right,
+			char		*name1,
+			char		*name2,
+			mode_t		mode,
+			int		retcode,
+			int		flags);
+
+void		dm_send_unmount_event(
+			struct vfs	*vfsp,
+			struct vnode	*vp,
+			dm_right_t	vfsp_right,
+			mode_t		mode,
+			int		retcode,
+			int		flags);
+
+int		dm_code_level(void);
+
+int		dm_vp_to_handle (
+			struct vnode	*vp,
+			xfs_handle_t	*handlep);
+
+/* The following prototypes and definitions are used by DMAPI as its
+   interface into the filesystem code. Communication between DMAPI and the
+   filesystem is established as follows:
+   1. DMAPI uses the VFS_DMAPI_FSYS_VECTOR to ask for the addresses
+      of all the functions within the filesystem that it may need to call.
+   2. The filesystem returns an array of function name/address pairs which
+      DMAPI builds into a function vector.
+   The VFS_DMAPI_FSYS_VECTOR call is only made one time for a particular
+   filesystem type. From then on, DMAPI uses its function vector to call the
+   filesystem functions directly. Functions in the array which DMAPI doesn't
+   recognize are ignored. A dummy function which returns ENOSYS is used for
+   any function that DMAPI needs but which was not provided by the filesystem.
+   If XFS doesn't recognize the VFS_DMAPI_FSYS_VECTOR, DMAPI assumes that it
+   doesn't have the X/Open support code; in this case DMAPI uses the XFS-code
+   originally bundled within DMAPI.
+
+   The goal of this interface is to allow incremental changes to be made to
+   both the filesystem and to DMAPI while minimizing inter-patch dependencies,
+   and to eventually allow DMAPI to support multiple filesystem types at the
+   same time should that become necessary.
+*/
+
+/* One identifier per filesystem function, numbered consecutively from 0;
+   DM_FSYS_MAX (35) is one more than the last identifier. NOTE(review):
+   presumably these are the "function name" half of the name/address pairs
+   described above -- confirm against the code that builds the vector.
+*/
+typedef enum {
+	DM_FSYS_CLEAR_INHERIT		=  0,
+	DM_FSYS_CREATE_BY_HANDLE	=  1,
+	DM_FSYS_DOWNGRADE_RIGHT		=  2,
+	DM_FSYS_GET_ALLOCINFO_RVP	=  3,
+	DM_FSYS_GET_BULKALL_RVP		=  4,
+	DM_FSYS_GET_BULKATTR_RVP	=  5,
+	DM_FSYS_GET_CONFIG		=  6,
+	DM_FSYS_GET_CONFIG_EVENTS	=  7,
+	DM_FSYS_GET_DESTROY_DMATTR	=  8,
+	DM_FSYS_GET_DIOINFO		=  9,
+	DM_FSYS_GET_DIRATTRS_RVP	= 10,
+	DM_FSYS_GET_DMATTR		= 11,
+	DM_FSYS_GET_EVENTLIST		= 12,
+	DM_FSYS_GET_FILEATTR		= 13,
+	DM_FSYS_GET_REGION		= 14,
+	DM_FSYS_GETALL_DMATTR		= 15,
+	DM_FSYS_GETALL_INHERIT		= 16,
+	DM_FSYS_INIT_ATTRLOC		= 17,
+	DM_FSYS_MKDIR_BY_HANDLE		= 18,
+	DM_FSYS_PROBE_HOLE		= 19,
+	DM_FSYS_PUNCH_HOLE		= 20,
+	DM_FSYS_READ_INVIS_RVP		= 21,
+	DM_FSYS_RELEASE_RIGHT		= 22,
+	DM_FSYS_REMOVE_DMATTR		= 23,
+	DM_FSYS_REQUEST_RIGHT		= 24,
+	DM_FSYS_SET_DMATTR		= 25,
+	DM_FSYS_SET_EVENTLIST		= 26,
+	DM_FSYS_SET_FILEATTR		= 27,
+	DM_FSYS_SET_INHERIT		= 28,
+	DM_FSYS_SET_REGION		= 29,
+	DM_FSYS_SYMLINK_BY_HANDLE	= 30,
+	DM_FSYS_SYNC_BY_HANDLE		= 31,
+	DM_FSYS_UPGRADE_RIGHT		= 32,
+	DM_FSYS_WRITE_INVIS_RVP		= 33,
+	DM_FSYS_OBJ_REF_HOLD		= 34,
+	DM_FSYS_MAX			= 35
+} dm_fsys_switch_t;
+
+
+#define DM_FSYS_OBJ	0x1		/* object refers to a fsys handle */
+
+
+/*
+ * Prototypes for filesystem-specific functions.
+ */
+
+typedef int	(*dm_fsys_clear_inherit_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep);
+
+typedef int	(*dm_fsys_create_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			void		*hanp,
+			size_t		hlen,
+			char		*cname);
+
+typedef int	(*dm_fsys_downgrade_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type);		/* DM_FSYS_OBJ or zero */
+
+typedef int	(*dm_fsys_get_allocinfo_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	*offp,
+			u_int		nelem,
+			dm_extent_t	*extentp,
+			u_int		*nelemp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_bulkall_rvp_t)(
+			vnode_t		*vp,		/* root vnode */
+			dm_right_t	right,
+			u_int		mask,
+			dm_attrname_t	*attrnamep,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_bulkattr_rvp_t)(
+			vnode_t		*vp,		/* root vnode */
+			dm_right_t	right,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_config_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_config_t	flagname,
+			dm_size_t	*retvalp);
+
+typedef int	(*dm_fsys_get_config_events_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_eventset_t	*eventsetp,
+			u_int		*nelemp);
+
+typedef int	(*dm_fsys_get_destroy_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			char		**valuepp,
+			int		*vlenp);
+
+typedef int	(*dm_fsys_get_dioinfo_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_dioinfo_t	*diop);
+
+typedef int	(*dm_fsys_get_dirattrs_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp);
+
+typedef int	(*dm_fsys_get_eventlist_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type,
+			u_int		nelem,
+			dm_eventset_t	*eventsetp,	/* in kernel space! */
+			u_int		*nelemp);	/* in kernel space! */
+
+typedef int	(*dm_fsys_get_fileattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		mask,
+			dm_stat_t	*statp);
+
+typedef int	(*dm_fsys_get_region_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_region_t	*regbufp,
+			u_int		*nelemp);
+
+typedef int	(*dm_fsys_getall_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp);
+
+typedef int	(*dm_fsys_getall_inherit_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_inherit_t	*inheritbufp,
+			u_int		*nelemp);
+
+typedef int	(*dm_fsys_init_attrloc_t)(
+			vnode_t		*vp,	/* sometimes root vnode */
+			dm_right_t	right,
+			dm_attrloc_t	*locp);
+
+typedef int	(*dm_fsys_mkdir_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			void		*hanp,
+			size_t		hlen,
+			char		*cname);
+
+typedef int	(*dm_fsys_probe_hole_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	off,
+			dm_size_t	len,
+			dm_off_t	*roffp,
+			dm_size_t	*rlenp);
+
+typedef int	(*dm_fsys_punch_hole_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	off,
+			dm_size_t	len);
+
+typedef int	(*dm_fsys_read_invis_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+typedef int	(*dm_fsys_release_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type);	/* presumably DM_FSYS_OBJ or zero, as for the other *_right calls -- TODO confirm */
+
+typedef int	(*dm_fsys_remove_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			int		setdtime,
+			dm_attrname_t	*attrnamep);
+
+typedef int	(*dm_fsys_request_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type,	/* DM_FSYS_OBJ or zero */
+			u_int		flags,
+			dm_right_t	newright);
+
+typedef int	(*dm_fsys_set_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			int		setdtime,
+			size_t		buflen,
+			void		*bufp);
+
+typedef int	(*dm_fsys_set_eventlist_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type,
+			dm_eventset_t	*eventsetp,	/* in kernel space! */
+			u_int		maxevent);
+
+typedef int	(*dm_fsys_set_fileattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		mask,
+			dm_fileattr_t	*attrp);
+
+typedef int	(*dm_fsys_set_inherit_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			mode_t		mode);
+
+typedef int	(*dm_fsys_set_region_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_region_t	*regbufp,
+			dm_boolean_t	*exactflagp);
+
+typedef int	(*dm_fsys_symlink_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			void		*hanp,
+			size_t		hlen,
+			char		*cname,
+			char		*path);
+
+typedef int	(*dm_fsys_sync_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right);
+
+typedef int	(*dm_fsys_upgrade_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type);		/* DM_FSYS_OBJ or zero */
+
+typedef int	(*dm_fsys_write_invis_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			int		flags,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+typedef void	(*dm_fsys_obj_ref_hold_t)(
+			vnode_t		*vp);
+
+
+/* Structure definitions used by the VFS_DMAPI_FSYS_VECTOR call.
*/
+
+typedef struct {
+	dm_fsys_switch_t  func_no;	/* function number */
+	union {
+		dm_fsys_clear_inherit_t clear_inherit;
+		dm_fsys_create_by_handle_t create_by_handle;
+		dm_fsys_downgrade_right_t downgrade_right;
+		dm_fsys_get_allocinfo_rvp_t get_allocinfo_rvp;
+		dm_fsys_get_bulkall_rvp_t get_bulkall_rvp;
+		dm_fsys_get_bulkattr_rvp_t get_bulkattr_rvp;
+		dm_fsys_get_config_t get_config;
+		dm_fsys_get_config_events_t get_config_events;
+		dm_fsys_get_destroy_dmattr_t get_destroy_dmattr;
+		dm_fsys_get_dioinfo_t get_dioinfo;
+		dm_fsys_get_dirattrs_rvp_t get_dirattrs_rvp;
+		dm_fsys_get_dmattr_t get_dmattr;
+		dm_fsys_get_eventlist_t get_eventlist;
+		dm_fsys_get_fileattr_t get_fileattr;
+		dm_fsys_get_region_t get_region;
+		dm_fsys_getall_dmattr_t getall_dmattr;
+		dm_fsys_getall_inherit_t getall_inherit;
+		dm_fsys_init_attrloc_t init_attrloc;
+		dm_fsys_mkdir_by_handle_t mkdir_by_handle;
+		dm_fsys_probe_hole_t probe_hole;
+		dm_fsys_punch_hole_t punch_hole;
+		dm_fsys_read_invis_rvp_t read_invis_rvp;
+		dm_fsys_release_right_t release_right;
+		dm_fsys_remove_dmattr_t remove_dmattr;
+		dm_fsys_request_right_t request_right;
+		dm_fsys_set_dmattr_t set_dmattr;
+		dm_fsys_set_eventlist_t set_eventlist;
+		dm_fsys_set_fileattr_t set_fileattr;
+		dm_fsys_set_inherit_t set_inherit;
+		dm_fsys_set_region_t set_region;
+		dm_fsys_symlink_by_handle_t symlink_by_handle;
+		dm_fsys_sync_by_handle_t sync_by_handle;
+		dm_fsys_upgrade_right_t upgrade_right;
+		dm_fsys_write_invis_rvp_t write_invis_rvp;
+		dm_fsys_obj_ref_hold_t obj_ref_hold;
+	} u_fc;				/* function address; func_no selects the member */
+} fsys_function_vector_t;
+
+struct dm_fcntl_vector {
+	int	code_level;	/* read back by DMAPI after the call; DMAPI does not preset it */
+	int	count;		/* Number of functions in the vector */
+	fsys_function_vector_t *vecp;	/* array of count entries; DMAPI presets it to NULL */
+};
+typedef struct dm_fcntl_vector	dm_fcntl_vector_t;
+
+struct dm_fcntl_mapevent {
+	size_t	length;			/* length of transfer */
+	dm_eventtype_t	max_event;	/* Maximum (WRITE or READ) event */
+	int	error;			/* returned error code */
+};
+typedef struct dm_fcntl_mapevent dm_fcntl_mapevent_t;
+
+#endif	/* __KERNEL__ */
+
+
+/* The following definitions are needed both by the kernel and by the
+   library routines.
+*/
+
+#define DM_MAX_HANDLE_SIZE	56	/* maximum size for a file handle */
+
+
+/*
+ *	Opcodes for dmapi ioctl (shared with the library routines, see above).
+ */
+
+#define DM_CLEAR_INHERIT	1
+#define DM_CREATE_BY_HANDLE	2
+#define DM_CREATE_SESSION	3
+#define DM_CREATE_USEREVENT	4
+#define DM_DESTROY_SESSION	5
+#define DM_DOWNGRADE_RIGHT	6
+#define DM_FD_TO_HANDLE		7
+#define DM_FIND_EVENTMSG	8
+#define DM_GET_ALLOCINFO	9
+#define DM_GET_BULKALL		10
+#define DM_GET_BULKATTR		11
+#define DM_GET_CONFIG		12
+#define DM_GET_CONFIG_EVENTS	13
+#define DM_GET_DIOINFO		14
+#define DM_GET_DIRATTRS		15
+#define DM_GET_DMATTR		16
+#define DM_GET_EVENTLIST	17
+#define DM_GET_EVENTS		18
+#define DM_GET_FILEATTR		19
+#define DM_GET_MOUNTINFO	20
+#define DM_GET_REGION		21
+#define DM_GETALL_DISP		22
+#define DM_GETALL_DMATTR	23
+#define DM_GETALL_INHERIT	24
+#define DM_GETALL_SESSIONS	25
+#define DM_GETALL_TOKENS	26
+#define DM_INIT_ATTRLOC		27
+#define DM_MKDIR_BY_HANDLE	28
+#define DM_MOVE_EVENT		29
+#define DM_OBJ_REF_HOLD		30
+#define DM_OBJ_REF_QUERY	31
+#define DM_OBJ_REF_RELE		32
+#define DM_PATH_TO_FSHANDLE	33
+#define DM_PATH_TO_HANDLE	34
+#define DM_PENDING		35
+#define DM_PROBE_HOLE		36
+#define DM_PUNCH_HOLE		37
+#define DM_QUERY_RIGHT		38
+#define DM_QUERY_SESSION	39
+#define DM_READ_INVIS		40
+#define DM_RELEASE_RIGHT	41
+#define DM_REMOVE_DMATTR	42
+#define DM_REQUEST_RIGHT	43
+#define DM_RESPOND_EVENT	44
+#define DM_SEND_MSG		45
+#define DM_SET_DISP		46
+#define DM_SET_DMATTR		47
+#define DM_SET_EVENTLIST	48
+#define DM_SET_FILEATTR		49
+#define DM_SET_INHERIT		50
+#define DM_SET_REGION		51
+#define DM_SET_RETURN_ON_DESTROY 52
+#define DM_SYMLINK_BY_HANDLE	53
+#define DM_SYNC_BY_HANDLE	54
+#define DM_UPGRADE_RIGHT	55
+#define DM_WRITE_INVIS		56
+#define DM_OPEN_BY_HANDLE	57
+
+#endif /* __DMAPI_KERN_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_mountinfo.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_mountinfo.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_mountinfo.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_mountinfo.c Mon Aug 26 19:22:13 2002 @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "dmapi_private.h" + +#ifdef linux +/* XXX */ +#define vfsmax 1 +#endif + +typedef struct { + int support_type; + char name[16]; + dm_fsys_vector_t *vptr; +} dm_vector_map_t; + +/* Values for the support_type field. 
*/ + +#define DM_SUPPORT_UNKNOWN 0 +#define DM_SUPPORT_AVAIL 1 + + +dm_vector_map_t *dm_fsys_map = NULL; + + +int +dm_code_level(void) +{ + return(DM_CLVL_XOPEN); /* initial X/Open compliant release */ +} + + +/* Dummy routine which is stored in each function vector slot for which the + filesystem provides no function of its own. If an application calls the + function, he will just get ENOSYS. +*/ + +static int +dm_enosys(void) +{ + return(ENOSYS); /* function not supported by filesystem */ +} + + +/* dm_query_fsys_for_vector() asks a filesystem for its list of supported + DMAPI functions, and builds a dm_vector_map_t structure based upon the + reply. We ignore functions supported by the filesystem which we do not + know about, and we substitute the subroutine 'dm_enosys' for each function + we know about but the filesystem does not support. +*/ + +static void +dm_query_fsys_for_vector( + vnode_t *vp) +{ + dm_vector_map_t *map; + fsys_function_vector_t *vecp; + dm_fsys_vector_t *vptr; + dm_fcntl_vector_t vecrq; + struct vfs *vfsp = vp->v_vfsp; + int fstype; + int error; + int i; + + /* XXX fstype = vfsp->vfs_fstype */ + fstype = 0; + map = &dm_fsys_map[fstype]; + + /* Clear out any information left from a previous filesystem that was + in this slot and initialize it for the new filesystem. + */ + + if (map->vptr) { + kfree(map->vptr); + map->vptr = NULL; + } + + /* XXX strcpy(map->name, vfssw[fstype].vsw_name); */ + strcpy(map->name, XFS_NAME); + + map->support_type = DM_SUPPORT_AVAIL; + + /* Next allocate a function vector and initialize all fields with a + dummy function that returns ENOSYS. 
+ */ + + vptr = map->vptr = kmalloc(sizeof(*map->vptr), SLAB_KERNEL); + if (vptr == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return; + } + + strncpy(vptr->fsys_name, map->name, sizeof(vptr->fsys_name)); + vptr->code_level = 0; + vptr->clear_inherit = (dm_fsys_clear_inherit_t)dm_enosys; + vptr->create_by_handle = (dm_fsys_create_by_handle_t)dm_enosys; + vptr->downgrade_right = (dm_fsys_downgrade_right_t)dm_enosys; + vptr->get_allocinfo_rvp = (dm_fsys_get_allocinfo_rvp_t)dm_enosys; + vptr->get_bulkall_rvp = (dm_fsys_get_bulkall_rvp_t)dm_enosys; + vptr->get_bulkattr_rvp = (dm_fsys_get_bulkattr_rvp_t)dm_enosys; + vptr->get_config = (dm_fsys_get_config_t)dm_enosys; + vptr->get_config_events = (dm_fsys_get_config_events_t)dm_enosys; + vptr->get_destroy_dmattr = (dm_fsys_get_destroy_dmattr_t)dm_enosys; + vptr->get_dioinfo = (dm_fsys_get_dioinfo_t)dm_enosys; + vptr->get_dirattrs_rvp = (dm_fsys_get_dirattrs_rvp_t)dm_enosys; + vptr->get_dmattr = (dm_fsys_get_dmattr_t)dm_enosys; + vptr->get_eventlist = (dm_fsys_get_eventlist_t)dm_enosys; + vptr->get_fileattr = (dm_fsys_get_fileattr_t)dm_enosys; + vptr->get_region = (dm_fsys_get_region_t)dm_enosys; + vptr->getall_dmattr = (dm_fsys_getall_dmattr_t)dm_enosys; + vptr->getall_inherit = (dm_fsys_getall_inherit_t)dm_enosys; + vptr->init_attrloc = (dm_fsys_init_attrloc_t)dm_enosys; + vptr->mkdir_by_handle = (dm_fsys_mkdir_by_handle_t)dm_enosys; + vptr->probe_hole = (dm_fsys_probe_hole_t)dm_enosys; + vptr->punch_hole = (dm_fsys_punch_hole_t)dm_enosys; + vptr->read_invis_rvp = (dm_fsys_read_invis_rvp_t)dm_enosys; + vptr->release_right = (dm_fsys_release_right_t)dm_enosys; + vptr->request_right = (dm_fsys_request_right_t)dm_enosys; + vptr->remove_dmattr = (dm_fsys_remove_dmattr_t)dm_enosys; + vptr->set_dmattr = (dm_fsys_set_dmattr_t)dm_enosys; + vptr->set_eventlist = (dm_fsys_set_eventlist_t)dm_enosys; + vptr->set_fileattr = (dm_fsys_set_fileattr_t)dm_enosys; + vptr->set_inherit = 
(dm_fsys_set_inherit_t)dm_enosys; + vptr->set_region = (dm_fsys_set_region_t)dm_enosys; + vptr->symlink_by_handle = (dm_fsys_symlink_by_handle_t)dm_enosys; + vptr->sync_by_handle = (dm_fsys_sync_by_handle_t)dm_enosys; + vptr->upgrade_right = (dm_fsys_upgrade_right_t)dm_enosys; + vptr->write_invis_rvp = (dm_fsys_write_invis_rvp_t)dm_enosys; + vptr->obj_ref_hold = (dm_fsys_obj_ref_hold_t)dm_enosys; + + /* Issue a VFS_DMAPI_FSYS_VECTOR to the filesystem in order to obtain + its vector of filesystem-specific DMAPI routines. + */ + + vecrq.count = 0; + vecrq.vecp = NULL; + + VFS_DMAPI_FSYS_VECTOR(vfsp, &vecrq, error); + + /* If we still have an error at this point, then the filesystem simply + does not support DMAPI, so we give up with all functions set to + ENOSYS. + */ + + if (error || vecrq.count == 0) + return; + + /* The request succeeded and we were given a vector which we need to + map to our current level. Overlay the dummy function with every + filesystem function we understand. + */ + + vptr->code_level = vecrq.code_level; + vecp = vecrq.vecp; + for (i = 0; i < vecrq.count; i++) { + switch (vecp[i].func_no) { + case DM_FSYS_CLEAR_INHERIT: + vptr->clear_inherit = vecp[i].u_fc.clear_inherit; + break; + case DM_FSYS_CREATE_BY_HANDLE: + vptr->create_by_handle = vecp[i].u_fc.create_by_handle; + break; + case DM_FSYS_DOWNGRADE_RIGHT: + vptr->downgrade_right = vecp[i].u_fc.downgrade_right; + break; + case DM_FSYS_GET_ALLOCINFO_RVP: + vptr->get_allocinfo_rvp = vecp[i].u_fc.get_allocinfo_rvp; + break; + case DM_FSYS_GET_BULKALL_RVP: + vptr->get_bulkall_rvp = vecp[i].u_fc.get_bulkall_rvp; + break; + case DM_FSYS_GET_BULKATTR_RVP: + vptr->get_bulkattr_rvp = vecp[i].u_fc.get_bulkattr_rvp; + break; + case DM_FSYS_GET_CONFIG: + vptr->get_config = vecp[i].u_fc.get_config; + break; + case DM_FSYS_GET_CONFIG_EVENTS: + vptr->get_config_events = vecp[i].u_fc.get_config_events; + break; + case DM_FSYS_GET_DESTROY_DMATTR: + vptr->get_destroy_dmattr = 
vecp[i].u_fc.get_destroy_dmattr; + break; + case DM_FSYS_GET_DIOINFO: + vptr->get_dioinfo = vecp[i].u_fc.get_dioinfo; + break; + case DM_FSYS_GET_DIRATTRS_RVP: + vptr->get_dirattrs_rvp = vecp[i].u_fc.get_dirattrs_rvp; + break; + case DM_FSYS_GET_DMATTR: + vptr->get_dmattr = vecp[i].u_fc.get_dmattr; + break; + case DM_FSYS_GET_EVENTLIST: + vptr->get_eventlist = vecp[i].u_fc.get_eventlist; + break; + case DM_FSYS_GET_FILEATTR: + vptr->get_fileattr = vecp[i].u_fc.get_fileattr; + break; + case DM_FSYS_GET_REGION: + vptr->get_region = vecp[i].u_fc.get_region; + break; + case DM_FSYS_GETALL_DMATTR: + vptr->getall_dmattr = vecp[i].u_fc.getall_dmattr; + break; + case DM_FSYS_GETALL_INHERIT: + vptr->getall_inherit = vecp[i].u_fc.getall_inherit; + break; + case DM_FSYS_INIT_ATTRLOC: + vptr->init_attrloc = vecp[i].u_fc.init_attrloc; + break; + case DM_FSYS_MKDIR_BY_HANDLE: + vptr->mkdir_by_handle = vecp[i].u_fc.mkdir_by_handle; + break; + case DM_FSYS_PROBE_HOLE: + vptr->probe_hole = vecp[i].u_fc.probe_hole; + break; + case DM_FSYS_PUNCH_HOLE: + vptr->punch_hole = vecp[i].u_fc.punch_hole; + break; + case DM_FSYS_READ_INVIS_RVP: + vptr->read_invis_rvp = vecp[i].u_fc.read_invis_rvp; + break; + case DM_FSYS_RELEASE_RIGHT: + vptr->release_right = vecp[i].u_fc.release_right; + break; + case DM_FSYS_REMOVE_DMATTR: + vptr->remove_dmattr = vecp[i].u_fc.remove_dmattr; + break; + case DM_FSYS_REQUEST_RIGHT: + vptr->request_right = vecp[i].u_fc.request_right; + break; + case DM_FSYS_SET_DMATTR: + vptr->set_dmattr = vecp[i].u_fc.set_dmattr; + break; + case DM_FSYS_SET_EVENTLIST: + vptr->set_eventlist = vecp[i].u_fc.set_eventlist; + break; + case DM_FSYS_SET_FILEATTR: + vptr->set_fileattr = vecp[i].u_fc.set_fileattr; + break; + case DM_FSYS_SET_INHERIT: + vptr->set_inherit = vecp[i].u_fc.set_inherit; + break; + case DM_FSYS_SET_REGION: + vptr->set_region = vecp[i].u_fc.set_region; + break; + case DM_FSYS_SYMLINK_BY_HANDLE: + vptr->symlink_by_handle = vecp[i].u_fc.symlink_by_handle; + 
break; + case DM_FSYS_SYNC_BY_HANDLE: + vptr->sync_by_handle = vecp[i].u_fc.sync_by_handle; + break; + case DM_FSYS_UPGRADE_RIGHT: + vptr->upgrade_right = vecp[i].u_fc.upgrade_right; + break; + case DM_FSYS_WRITE_INVIS_RVP: + vptr->write_invis_rvp = vecp[i].u_fc.write_invis_rvp; + break; + case DM_FSYS_OBJ_REF_HOLD: + vptr->obj_ref_hold = vecp[i].u_fc.obj_ref_hold; + break; + default: /* ignore ones we don't understand */ + break; + } + } +} + + +/* Given a behavior pointer, dm_fsys_vector() returns a pointer to the DMAPI + function vector to be used for the corresponding vnode. There is one possible + function vector for each filesystem type, although currently XFS is the + only filesystem that actually supports DMAPI. +*/ + +dm_fsys_vector_t * +dm_fsys_vector( + vnode_t *vp) +{ + dm_vector_map_t *map; + int fstype; + + /* XXX fstype = vp->v_vfsp->vfs_fstype */ + fstype = 0; + + /* If this is the first call, initialize the filesystem function + vector map. + */ + + if (dm_fsys_map == NULL) { + int size = vfsmax * sizeof(*dm_fsys_map); + + dm_fsys_map = (dm_vector_map_t *)kmalloc(size, GFP_KERNEL); + if (dm_fsys_map == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return NULL; + } + memset(dm_fsys_map, 0, size); + } + map = &dm_fsys_map[fstype]; + + /* If a new filesystem has been dynamically loaded into a slot + previously held by another filesystem, then treat it as a + DM_SUPPORT_UNKNOWN. + */ + + /* XXX if (strcmp(map->name, vfssw[fstype].vsw_name)) */ + if (strcmp(map->name, XFS_NAME)) + map->support_type = DM_SUPPORT_UNKNOWN; + + /* If we don't yet know what the filesystem supports, ask it. */ + + if (map->support_type == DM_SUPPORT_UNKNOWN) + dm_query_fsys_for_vector(vp); + + /* Now return the function vector. 
*/ + + return(map->vptr); +} + + +void +dm_fsys_vector_free() +{ + dm_vector_map_t *map; + int i; + + if (dm_fsys_map) { + for (i = 0; i < vfsmax; i++){ + map = &dm_fsys_map[i]; + if (map->vptr) + kfree(map->vptr); + } + kfree(dm_fsys_map); + } +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_private.h linux-2.4-xfs/fs/xfs/dmapi/dmapi_private.h --- linux-2.4.19/fs/xfs/dmapi/dmapi_private.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_private.h Wed Sep 4 23:38:40 2002 @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef _DMAPI_PRIVATE_H
+#define _DMAPI_PRIVATE_H
+
+#include "xfs.h"
+
+#ifdef CONFIG_PROC_FS
+#define DMAPI_PROCFS		"fs/xfs_dmapi_v2" /* DMAPI device in /proc. */
+#define DMAPI_DBG_PROCFS	"fs/xfs_dmapi_d" /* DMAPI debugging dir */
+#endif
+
+extern struct kmem_cache_s	*dm_fsreg_cachep;
+extern struct kmem_cache_s	*dm_tokdata_cachep;
+extern struct kmem_cache_s	*dm_session_cachep;
+
+
+
+typedef struct dm_tokdata {
+	struct dm_tokdata *td_next;
+	struct dm_tokevent *td_tevp;	/* pointer to owning tevp */
+	int		td_app_ref;	/* # app threads currently active */
+	dm_right_t	td_orig_right;	/* original right held when created */
+	dm_right_t	td_right;	/* current right held for this handle */
+	short		td_flags;	/* DM_TDF_* values, see below */
+	short		td_type;	/* object type (DM_TDT_* values, see below) */
+	int		td_vcount;	/* # of current application VN_HOLDs */
+	vnode_t		*td_vp;		/* vnode pointer */
+	xfs_handle_t	td_handle;	/* handle for vp or vfsp */
+} dm_tokdata_t;
+
+/* values for td_type */
+
+#define DM_TDT_NONE	0x00		/* td_handle is empty */
+#define DM_TDT_VFS	0x01		/* td_handle points to a vfs */
+#define DM_TDT_REG	0x02		/* td_handle points to a file */
+#define DM_TDT_DIR	0x04		/* td_handle points to a directory */
+#define DM_TDT_LNK	0x08		/* td_handle points to a symlink */
+#define DM_TDT_OTH	0x10		/* some other object eg. pipe, socket */
+
+#define DM_TDT_VNO	(DM_TDT_REG|DM_TDT_DIR|DM_TDT_LNK|DM_TDT_OTH)
+#define DM_TDT_ANY	(DM_TDT_VFS|DM_TDT_REG|DM_TDT_DIR|DM_TDT_LNK|DM_TDT_OTH)
+
+/* values for td_flags */
+
+#define DM_TDF_ORIG	0x0001		/* part of the original event */
+#define DM_TDF_EVTREF	0x0002		/* event thread holds vnode reference */
+#define DM_TDF_STHREAD	0x0004		/* only one app can use this handle */
+#define DM_TDF_RIGHT	0x0008		/* vcount bumped for dm_request_right */
+#define DM_TDF_HOLD	0x0010		/* vcount bumped for dm_obj_ref_hold */
+
+
+/* Because some events contain __u64 fields, we force te_msg and te_event
+   to always be 8-byte aligned.  In order to send more than one message in
+   a single dm_get_events() call, we also ensure that each message is an
+   8-byte multiple.
+*/
+
+typedef struct dm_tokevent {
+	struct dm_tokevent *te_next;
+	struct dm_tokevent *te_hashnext; /* hash chain */
+	lock_t		te_lock;	/* lock for all fields but te_*next.
+					 * te_next and te_hashnext are
+					 * protected by the session lock.
+					 */
+	short		te_flags;	/* DM_TEF_* values, see below */
+	short		te_allocsize;	/* alloc'ed size of this structure */
+	sv_t		te_evt_queue;	/* queue waiting for dm_respond_event */
+	sv_t		te_app_queue;	/* queue waiting for handle access */
+	int		te_evt_ref;	/* number of event procs using token */
+	int		te_app_ref;	/* number of app procs using token */
+	int		te_app_slp;	/* number of app procs sleeping */
+	int		te_reply;	/* return errno for sync messages */
+	dm_tokdata_t	*te_tdp;	/* list of handle/right pairs */
+	union {
+		__u64		align;	/* force alignment of te_msg */
+		dm_eventmsg_t	te_msg;		/* user visible part */
+	} te_u;
+	__u64		te_event;	/* start of dm_xxx_event_t message */
+} dm_tokevent_t;
+
+#define te_msg	te_u.te_msg
+
+/* values for te_flags */
+
+#define DM_TEF_LOCKED	0x0001		/* event "locked" by dm_get_events() */
+#define DM_TEF_INTERMED 0x0002		/* a dm_pending reply was received */
+#define DM_TEF_FINAL	0x0004		/* dm_respond_event has been received */
+#ifdef __sgi
+#define DM_TEF_HASHED	0x0010		/* event is on hash chain */
+#endif
+
+
+#ifdef __sgi
+#ifdef DEBUG
+#define DM_SHASH_DEBUG
+#endif
+
+typedef struct dm_sesshash {
+	dm_tokevent_t	*h_next;	/* ptr to chain of tokevents */
+#ifdef DM_SHASH_DEBUG
+	int		maxlength;
+	int		curlength;
+	int		num_adds;
+	int		num_dels;
+	int		dup_hits;
+#endif
+} dm_sesshash_t;
+#endif
+
+
+typedef struct dm_eventq {
+	dm_tokevent_t	*eq_head;
+	dm_tokevent_t	*eq_tail;
+	int		eq_count;	/* size of queue */
+} dm_eventq_t;
+
+
+typedef struct dm_session {
+	struct dm_session *sn_next;	/* sessions linkage */
+	dm_sessid_t	sn_sessid;	/* user-visible session number */
+	u_int		sn_flags;	/* DM_SN_* values, see below */
+	lock_t		sn_qlock;	/* lock for newq/delq related fields */
+	sv_t		sn_readerq;	/* waiting for message on sn_newq */
+	sv_t		sn_writerq;	/* waiting for room on sn_newq */
+	u_int		sn_readercnt;	/* count of waiting readers */
+	u_int		sn_writercnt;	/* count of waiting writers */
+	dm_eventq_t	sn_newq;	/* undelivered event queue */
+	dm_eventq_t	sn_delq;	/* delivered event queue */
+	dm_eventq_t	sn_evt_writerq; /* events of thrds in sn_writerq */
+#ifdef __sgi
+	dm_sesshash_t	*sn_sesshash;	/* buckets for tokevent hash chains */
+#ifdef DM_SHASH_DEBUG
+	int		sn_buckets_in_use;
+	int		sn_max_buckets_in_use;
+#endif
+#endif
+	char		sn_info[DM_SESSION_INFO_LEN];	/* user-supplied info */
+} dm_session_t;
+
+/* values for sn_flags */
+
+#define DM_SN_WANTMOUNT 0x0001		/* session wants to get mount events */
+
+
+typedef enum {
+	DM_STATE_MOUNTING,
+	DM_STATE_MOUNTED,
+	DM_STATE_UNMOUNTING,
+	DM_STATE_UNMOUNTED
+} dm_fsstate_t;
+
+
+typedef struct dm_fsreg {
+	struct dm_fsreg	*fr_next;
+	vfs_t		*fr_vfsp;	/* filesystem pointer */
+	dm_tokevent_t	*fr_tevp;
+	fsid_t		fr_fsid;	/* filesystem ID */
+	void		*fr_msg;	/* dm_mount_event_t for filesystem */
+	int		fr_msgsize;	/* size of dm_mount_event_t */
+	dm_fsstate_t	fr_state;	/* mount state, see dm_fsstate_t */
+	sv_t		fr_dispq;
+	int		fr_dispcnt;
+	dm_eventq_t	fr_evt_dispq;	/* events of thrds in fr_dispq */
+	sv_t		fr_queue;	/* queue for hdlcnt/vfscnt/unmount */
+	lock_t		fr_lock;
+	int		fr_hdlcnt;	/* threads blocked during unmount */
+	int		fr_vfscnt;	/* threads in VFS_VGET or VFS_ROOT */
+	int		fr_unmount;	/* if non-zero, umount is sleeping */
+	dm_attrname_t	fr_rattr;	/* dm_set_return_on_destroy attribute */
+	dm_session_t	*fr_sessp [DM_EVENT_MAX];	/* presumably the disposition session per event type -- TODO confirm */
+} dm_fsreg_t;
+
+
+
+
+/* events valid in dm_set_disp() when called with a filesystem handle.
*/
+
+#define DM_VALID_DISP_EVENTS		( \
+	(1 << DM_EVENT_PREUNMOUNT)	| \
+	(1 << DM_EVENT_UNMOUNT)		| \
+	(1 << DM_EVENT_NOSPACE)		| \
+	(1 << DM_EVENT_DEBUT)		| \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_READ)		| \
+	(1 << DM_EVENT_WRITE)		| \
+	(1 << DM_EVENT_TRUNCATE)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+
+/* true if the dm_tokevent_t's event type is READ, WRITE or TRUNCATE */
+
+#define DM_EVENT_RDWRTRUNC(tevp) (  \
+	((tevp)->te_msg.ev_type == DM_EVENT_READ) || \
+	((tevp)->te_msg.ev_type == DM_EVENT_WRITE) || \
+	((tevp)->te_msg.ev_type == DM_EVENT_TRUNCATE) )
+
+
+/*
+ *	Global handle hack isolation: true when (hanp, hlen) is the global handle.
+ */
+
+#define DM_GLOBALHAN(hanp, hlen)	(((hanp) == DM_GLOBAL_HANP) && \
+					 ((hlen) == DM_GLOBAL_HLEN))
+
+
+#define DM_MAX_MSG_DATA		3960	/* presumably max bytes of message data -- TODO confirm */
+
+
+
+/* Supported filesystem function vector functions.
*/
+
+
+typedef struct {	/* unsupported slots hold a stub that returns ENOSYS */
+	int				code_level;	/* level reported by the filesystem, else 0 */
+	char				fsys_name[16];	/* copy of the filesystem name */
+	dm_fsys_clear_inherit_t		clear_inherit;
+	dm_fsys_create_by_handle_t	create_by_handle;
+	dm_fsys_downgrade_right_t	downgrade_right;
+	dm_fsys_get_allocinfo_rvp_t	get_allocinfo_rvp;
+	dm_fsys_get_bulkall_rvp_t	get_bulkall_rvp;
+	dm_fsys_get_bulkattr_rvp_t	get_bulkattr_rvp;
+	dm_fsys_get_config_t		get_config;
+	dm_fsys_get_config_events_t	get_config_events;
+	dm_fsys_get_destroy_dmattr_t	get_destroy_dmattr;
+	dm_fsys_get_dioinfo_t		get_dioinfo;
+	dm_fsys_get_dirattrs_rvp_t	get_dirattrs_rvp;
+	dm_fsys_get_dmattr_t		get_dmattr;
+	dm_fsys_get_eventlist_t		get_eventlist;
+	dm_fsys_get_fileattr_t		get_fileattr;
+	dm_fsys_get_region_t		get_region;
+	dm_fsys_getall_dmattr_t		getall_dmattr;
+	dm_fsys_getall_inherit_t	getall_inherit;
+	dm_fsys_init_attrloc_t		init_attrloc;
+	dm_fsys_mkdir_by_handle_t	mkdir_by_handle;
+	dm_fsys_probe_hole_t		probe_hole;
+	dm_fsys_punch_hole_t		punch_hole;
+	dm_fsys_read_invis_rvp_t	read_invis_rvp;
+	dm_fsys_release_right_t		release_right;
+	dm_fsys_remove_dmattr_t		remove_dmattr;
+	dm_fsys_request_right_t		request_right;
+	dm_fsys_set_dmattr_t		set_dmattr;
+	dm_fsys_set_eventlist_t		set_eventlist;
+	dm_fsys_set_fileattr_t		set_fileattr;
+	dm_fsys_set_inherit_t		set_inherit;
+	dm_fsys_set_region_t		set_region;
+	dm_fsys_symlink_by_handle_t	symlink_by_handle;
+	dm_fsys_sync_by_handle_t	sync_by_handle;
+	dm_fsys_upgrade_right_t		upgrade_right;
+	dm_fsys_write_invis_rvp_t	write_invis_rvp;
+	dm_fsys_obj_ref_hold_t		obj_ref_hold;
+} dm_fsys_vector_t;
+
+
+extern dm_session_t	*dm_sessions;		/* head of session list */
+extern dm_fsreg_t	*dm_registers;		/* presumably head of the registration list -- TODO confirm */
+extern lock_t		dm_reg_lock;		/* lock for registration list */
+
+/*
+ * Kernel only prototypes.
+ */
+
+int		dm_find_session_and_lock(
+			dm_sessid_t	sid,
+			dm_session_t	**sessionpp,
+			unsigned long	*lcp);
+
+int		dm_find_msg_and_lock(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			dm_tokevent_t	**tevpp,
+			unsigned long	*lcp);
+
+dm_tokevent_t * dm_evt_create_tevp(
+			dm_eventtype_t	event,
+			int		variable_size,
+			void		**msgpp);
+
+int		dm_app_get_tdp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			short		types,
+			dm_right_t	right,
+			dm_tokdata_t	**tdpp);
+
+int		dm_get_config_tdp(
+			void		*hanp,
+			size_t		hlen,
+			dm_tokdata_t	**tdpp);
+
+void		dm_app_put_tdp(
+			dm_tokdata_t	*tdp);
+
+void		dm_put_tevp(
+			dm_tokevent_t	*tevp,
+			dm_tokdata_t	*tdp);
+
+void		dm_evt_rele_tevp(
+			dm_tokevent_t	*tevp,
+			int		droprights);
+
+int		dm_enqueue_normal_event(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp,
+			int		flags);
+
+int		dm_enqueue_mount_event(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp);
+
+int		dm_enqueue_sendmsg_event(
+			dm_sessid_t	targetsid,
+			dm_tokevent_t	*tevp,
+			int		synch);
+
+int		dm_enqueue_user_event(
+			dm_sessid_t	sid,
+			dm_tokevent_t	*tevp,
+			dm_token_t	*tokenp);
+
+int		dm_obj_ref_query_rvp(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			void		*hanp,
+			size_t		hlen,
+			int		*rvp);
+
+int		dm_read_invis_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+int		dm_write_invis_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			int		flags,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+int		dm_get_bulkattr_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvp);
+
+int		dm_get_bulkall_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		mask,
+			dm_attrname_t	*attrnamep,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvp);
+
+int		dm_get_dirattrs_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvp);
+
+int		dm_get_allocinfo_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_off_t	*offp,
+			u_int		nelem,
+			dm_extent_t	*extentp,
+			u_int		*nelemp,
+			int		*rvp);
+
+int		dm_waitfor_destroy_attrname(
+			vfs_t		*vfsp,
+			dm_attrname_t	*attrnamep);
+
+void		dm_clear_fsreg(
+			dm_session_t	*s);
+
+int		dm_add_fsys_entry(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp);
+
+void		dm_change_fsys_entry(
+			vfs_t		*vfsp,
+			dm_fsstate_t	newstate);
+
+void		dm_remove_fsys_entry(
+			vfs_t		*vfsp);
+
+dm_fsys_vector_t *dm_fsys_vector(
+			vnode_t		*vp);	/* result may be NULL if an allocation failed */
+
+void		dm_fsys_vector_free(void);	/* frees every function vector and the map */
+
+int		dm_waitfor_disp_session(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp,
+			dm_session_t	**sessionpp,
+			unsigned long	*lcp);
+
+vnode_t *	dm_handle_to_vp (
+			xfs_handle_t	*handlep,
+			short		*typep);	/* presumably receives a DM_TDT_* type -- TODO confirm */
+
+int		dm_check_dmapi_vp(
+			vnode_t		*vp);
+
+dm_tokevent_t * dm_find_mount_tevp_and_lock(
+			fsid_t		*fsidp,
+			unsigned long	*lcp);
+
+int		dm_path_to_hdl(
+			char		*path,
+			void		*hanp,
+			size_t		*hlenp);
+
+int		dm_path_to_fshdl(
+			char		*path,
+			void		*hanp,
+			size_t		*hlenp);
+
+int		dm_fd_to_hdl(
+			int		fd,
+			void		*hanp,
+			size_t		*hlenp);
+
+int		dm_upgrade_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token);
+
+int		dm_downgrade_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token);
+
+int		dm_request_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		flags,
+			dm_right_t	right);
+
+int		dm_release_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token);
+
+int		dm_query_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_right_t	*rightp);
+
+
+int		dm_set_eventlist(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_eventset_t	*eventsetp,
+			u_int		maxevent);
+
+int		dm_obj_ref_hold(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			void		*hanp,
+			size_t		hlen);
+
+int		dm_obj_ref_rele(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			void		*hanp,
+			size_t		hlen);
+
+int		dm_get_eventlist(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		nelem,
+			dm_eventset_t	*eventsetp,
+			u_int		*nelemp);
+
+
+int		dm_set_disp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_eventset_t	*eventsetp,
+			u_int		maxevent);
+
+
+int		dm_set_return_on_destroy(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_attrname_t	*attrnamep,
+			dm_boolean_t	enable);
+
+
+int		dm_get_mountinfo(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp);
+
+void		dm_link_event(
+			dm_tokevent_t	*tevp,
+			dm_eventq_t	*queue);
+
+void		dm_unlink_event(
+			dm_tokevent_t	*tevp,
+			dm_eventq_t	*queue);
+
+int		dm_open_by_handle_rvp(
+			unsigned int	fd,
+			void		*hanp,
+			size_t		hlen,
+			int		mode,
+			int		*rvp);
+
+int		dm_copyin_handle(
+			void		*hanp,
+			size_t		hlen,
+			xfs_handle_t	*handlep);
+
+#endif	/* _DMAPI_PRIVATE_H */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_region.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_region.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_region.c	Thu Jan  1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_region.c	Tue Jul 30 18:08:15 2002
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.
Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "dmapi_private.h" + + +int +dm_get_region( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + u_int nelem, + dm_region_t *regbufp, + u_int *nelemp) +{ + dm_fsys_vector_t *fsys_vector; + dm_tokdata_t *tdp; + int error; + + error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG, + DM_RIGHT_SHARED, &tdp); + if (error != 0) + return(error); + + VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp)); + fsys_vector = dm_fsys_vector(tdp->td_vp); + error = fsys_vector->get_region(tdp->td_vp, tdp->td_right, + nelem, regbufp, nelemp); + VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp)); + + dm_app_put_tdp(tdp); + return(error); +} + + + +int +dm_set_region( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + u_int nelem, + dm_region_t *regbufp, + dm_boolean_t *exactflagp) +{ + dm_fsys_vector_t *fsys_vector; + dm_tokdata_t *tdp; + int error; + + error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG, + DM_RIGHT_EXCL, &tdp); + if (error != 0) + return(error); + + VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp)); + fsys_vector = dm_fsys_vector(tdp->td_vp); + error = fsys_vector->set_region(tdp->td_vp, tdp->td_right, + nelem, regbufp, exactflagp); + VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp)); + + dm_app_put_tdp(tdp); + return(error); +} 
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_register.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_register.c --- linux-2.4.19/fs/xfs/dmapi/dmapi_register.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_register.c Sat Aug 24 17:08:28 2002 @@ -0,0 +1,1555 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "dmapi_private.h" +#include +#include +#include + +dm_fsreg_t *dm_registers; /* head of filesystem registration list */ +int dm_fsys_cnt; /* number of filesystems on dm_registers list */ +lock_t dm_reg_lock = SPIN_LOCK_UNLOCKED;/* lock for dm_registers */ + + + +#ifdef CONFIG_PROC_FS +static int +fsreg_read_pfs(char *buffer, char **start, off_t offset, + int count, int *eof, void *data) +{ + int len; + int i; + dm_fsreg_t *fsrp = (dm_fsreg_t*)data; + char statebuf[30]; + +#define CHKFULL if(len >= count) break; +#define ADDBUF(a,b) len += sprintf(buffer + len, a, b); CHKFULL; + + switch (fsrp->fr_state) { + case DM_STATE_MOUNTING: sprintf(statebuf, "mounting"); break; + case DM_STATE_MOUNTED: sprintf(statebuf, "mounted"); break; + case DM_STATE_UNMOUNTING: sprintf(statebuf, "unmounting"); break; + case DM_STATE_UNMOUNTED: sprintf(statebuf, "unmounted"); break; + default: + sprintf(statebuf, "unknown:%d", (int)fsrp->fr_state); + break; + } + + len=0; + while(1){ + ADDBUF("fsrp=0x%p\n", fsrp); + ADDBUF("fr_next=0x%p\n", fsrp->fr_next); + ADDBUF("fr_vfsp=0x%p\n", fsrp->fr_vfsp); + ADDBUF("fr_tevp=0x%p\n", fsrp->fr_tevp); + ADDBUF("fr_fsid=%c\n", '?'); + ADDBUF("fr_msg=0x%p\n", fsrp->fr_msg); + ADDBUF("fr_msgsize=%d\n", fsrp->fr_msgsize); + ADDBUF("fr_state=%s\n", statebuf); + ADDBUF("fr_dispq=%c\n", '?'); + ADDBUF("fr_dispcnt=%d\n", fsrp->fr_dispcnt); + + ADDBUF("fr_evt_dispq.eq_head=0x%p\n", fsrp->fr_evt_dispq.eq_head); + ADDBUF("fr_evt_dispq.eq_tail=0x%p\n", fsrp->fr_evt_dispq.eq_tail); + ADDBUF("fr_evt_dispq.eq_count=%d\n", fsrp->fr_evt_dispq.eq_count); + + ADDBUF("fr_queue=%c\n", '?'); + ADDBUF("fr_lock=%c\n", '?'); + ADDBUF("fr_hdlcnt=%d\n", fsrp->fr_hdlcnt); + ADDBUF("fr_vfscnt=%d\n", 
fsrp->fr_vfscnt); + ADDBUF("fr_unmount=%d\n", fsrp->fr_unmount); + + len += sprintf(buffer + len, "fr_rattr="); + CHKFULL; + for(i = 0; i <= DM_ATTR_NAME_SIZE; ++i){ + ADDBUF("%c", fsrp->fr_rattr.an_chars[i]); + } + CHKFULL; + len += sprintf(buffer + len, "\n"); + CHKFULL; + + for(i = 0; i < DM_EVENT_MAX; i++){ + if( fsrp->fr_sessp[i] != NULL ){ + ADDBUF("fr_sessp[%d]=", i); + ADDBUF("0x%p\n", fsrp->fr_sessp[i]); + } + } + CHKFULL; + + break; + } + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} +#endif + + +/* Returns a pointer to the filesystem structure for the filesystem + referenced by vfsp. The caller is responsible for obtaining dm_reg_lock + before calling this routine. +*/ + +static dm_fsreg_t * +dm_find_fsreg( + fsid_t *fsidp) +{ + dm_fsreg_t *fsrp; + + for (fsrp = dm_registers; fsrp; fsrp = fsrp->fr_next) { + if (!bcmp(&fsrp->fr_fsid, fsidp, sizeof(*fsidp))) + break; + } + return(fsrp); +} + + +/* Given a fsid_t, dm_find_fsreg_and_lock() finds the dm_fsreg_t structure + for that filesytem if one exists, and returns a pointer to the structure + after obtaining its 'fr_lock' so that the caller can safely modify the + dm_fsreg_t. The caller is responsible for releasing 'fr_lock'. +*/ + +static dm_fsreg_t * +dm_find_fsreg_and_lock( + fsid_t *fsidp, + unsigned long *lcp) /* address of returned lock cookie */ +{ + dm_fsreg_t *fsrp; + + for (;;) { + *lcp = mutex_spinlock(&dm_reg_lock); + + if ((fsrp = dm_find_fsreg(fsidp)) == NULL) { + mutex_spinunlock(&dm_reg_lock, *lcp); + return(NULL); + } + if (spin_trylock(&fsrp->fr_lock)) { + nested_spinunlock(&dm_reg_lock); + return(fsrp); /* success */ + } + + /* If the second lock is not available, drop the first and + start over. This gives the CPU a chance to process any + interrupts, and also allows processes which want a fr_lock + for a different filesystem to proceed. 
+ */ + + mutex_spinunlock(&dm_reg_lock, *lcp); + } +} + + +/* dm_add_fsys_entry() is called when a DM_EVENT_MOUNT event is about to be + sent. It creates a dm_fsreg_t structure for the filesystem and stores a + pointer to a copy of the mount event within that structure so that it is + available for subsequent dm_get_mountinfo() calls. +*/ + +int +dm_add_fsys_entry( + vfs_t *vfsp, + dm_tokevent_t *tevp) +{ + dm_fsreg_t *fsrp; + int msgsize; + void *msg; + unsigned long lc; /* lock cookie */ + + /* Allocate and initialize a dm_fsreg_t structure for the filesystem. */ + + msgsize = tevp->te_allocsize - offsetof(dm_tokevent_t, te_event); + msg = kmalloc(msgsize, GFP_KERNEL); + if (msg == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + bcopy(&tevp->te_event, msg, msgsize); + + fsrp = kmem_cache_alloc(dm_fsreg_cachep, SLAB_KERNEL); + if (fsrp == NULL) { + kfree(msg); + printk("%s/%d: kmem_cache_alloc(dm_fsreg_cachep) returned NULL\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + memset(fsrp, 0, sizeof(*fsrp)); + + fsrp->fr_vfsp = vfsp; + fsrp->fr_tevp = tevp; + fsrp->fr_fsid = *vfsp->vfs_altfsid; + fsrp->fr_msg = msg; + fsrp->fr_msgsize = msgsize; + fsrp->fr_state = DM_STATE_MOUNTING; + sv_init(&fsrp->fr_dispq, SV_DEFAULT, "fr_dispq"); + sv_init(&fsrp->fr_queue, SV_DEFAULT, "fr_queue"); + spinlock_init(&fsrp->fr_lock, "fr_lock"); + + /* If no other mounted DMAPI filesystem already has this same + fsid_t, then add this filesystem to the list. 
+ */ + + lc = mutex_spinlock(&dm_reg_lock); + + if (!dm_find_fsreg(vfsp->vfs_altfsid)) { + fsrp->fr_next = dm_registers; + dm_registers = fsrp; + dm_fsys_cnt++; +#ifdef CONFIG_PROC_FS + { + char buf[100]; + struct proc_dir_entry *entry; + + sprintf(buf, DMAPI_DBG_PROCFS "/fsreg/0x%p", fsrp); + entry = create_proc_read_entry(buf, 0, 0, fsreg_read_pfs, fsrp); + entry->owner = THIS_MODULE; + } +#endif + mutex_spinunlock(&dm_reg_lock, lc); + return(0); + } + + /* A fsid_t collision occurred, so prevent this new filesystem from + mounting. + */ + + mutex_spinunlock(&dm_reg_lock, lc); + + sv_destroy(&fsrp->fr_dispq); + sv_destroy(&fsrp->fr_queue); + spinlock_destroy(&fsrp->fr_lock); + kfree(msg); + kmem_cache_free(dm_fsreg_cachep, fsrp); + return(EBUSY); +} + + +/* dm_change_fsys_entry() is called whenever a filesystem's mount state is + about to change. The state is changed to DM_STATE_MOUNTED after a + successful DM_EVENT_MOUNT event or after a failed unmount. It is changed + to DM_STATE_UNMOUNTING after a successful DM_EVENT_PREUNMOUNT event. + Finally, the state is changed to DM_STATE_UNMOUNTED after a successful + unmount. It stays in this state until the DM_EVENT_UNMOUNT event is + queued, at which point the filesystem entry is removed. +*/ + +void +dm_change_fsys_entry( + vfs_t *vfsp, + dm_fsstate_t newstate) +{ + dm_fsreg_t *fsrp; + int seq_error; + unsigned long lc; /* lock cookie */ + + /* Find the filesystem referenced by the vfsp's fsid_t. This should + always succeed. + */ + + if ((fsrp = dm_find_fsreg_and_lock(vfsp->vfs_altfsid, &lc)) == NULL) { + panic("dm_change_fsys_entry: can't find DMAPI fsrp for " + "vfsp %p\n", vfsp); + } + + /* Make sure that the new state is acceptable given the current state + of the filesystem. Any error here is a major DMAPI/filesystem + screwup. 
+ */ + + seq_error = 0; + switch (newstate) { + case DM_STATE_MOUNTED: + if (fsrp->fr_state != DM_STATE_MOUNTING && + fsrp->fr_state != DM_STATE_UNMOUNTING) { + seq_error++; + } + break; + case DM_STATE_UNMOUNTING: + if (fsrp->fr_state != DM_STATE_MOUNTED) + seq_error++; + break; + case DM_STATE_UNMOUNTED: + if (fsrp->fr_state != DM_STATE_UNMOUNTING) + seq_error++; + break; + default: + seq_error++; + break; + } + if (seq_error) { + panic("dm_change_fsys_entry: DMAPI sequence error: old state " + "%d, new state %d, fsrp %p\n", fsrp->fr_state, + newstate, fsrp); + } + + /* If the old state was DM_STATE_UNMOUNTING, then processes could be + sleeping in dm_handle_to_vp() waiting for their DM_NO_TOKEN handles + to be translated to vnodes. Wake them up so that they either + continue (new state is DM_STATE_MOUNTED) or fail (new state is + DM_STATE_UNMOUNTED). + */ + + if (fsrp->fr_state == DM_STATE_UNMOUNTING) { + if (fsrp->fr_hdlcnt) + sv_broadcast(&fsrp->fr_queue); + } + + /* Change the filesystem's mount state to its new value. */ + + fsrp->fr_state = newstate; + fsrp->fr_tevp = NULL; /* not valid after DM_STATE_MOUNTING */ + + /* If the new state is DM_STATE_UNMOUNTING, wait until any application + threads currently in the process of making VFS_VGET and VFS_ROOT + calls are done before we let this unmount thread continue the + unmount. (We want to make sure that the unmount will see these + vnode references during its scan.) + */ + + if (newstate == DM_STATE_UNMOUNTING) { + while (fsrp->fr_vfscnt) { + fsrp->fr_unmount++; + sv_wait(&fsrp->fr_queue, 1, &fsrp->fr_lock, lc); + lc = mutex_spinlock(&fsrp->fr_lock); + fsrp->fr_unmount--; + } + } + + mutex_spinunlock(&fsrp->fr_lock, lc); +} + + +/* dm_remove_fsys_entry() gets called after a failed mount or after an + DM_EVENT_UNMOUNT event has been queued. (The filesystem entry must stay + until the DM_EVENT_UNMOUNT reply is queued so that the event can use the + 'fr_sessp' list to see which session to send the event to.) 
+
+   On return the dm_fsreg_t has been unlinked from the dm_registers list
+   and freed together with its saved mount message.  The routine panics if
+   no entry matches the vfsp's fsid_t, or if the entry is in a state other
+   than DM_STATE_MOUNTING or DM_STATE_UNMOUNTED.
+*/
+
+void
+dm_remove_fsys_entry(
+	vfs_t		*vfsp)
+{
+	dm_fsreg_t	**fsrpp;	/* link that points at the current entry */
+	dm_fsreg_t	*fsrp;
+	unsigned long	lc;		/* lock cookie */
+
+	/* Find the filesystem referenced by the vfsp's fsid_t and dequeue
+	   it after verifying that the fr_state shows a filesystem that is
+	   either mounting or unmounted.
+	*/
+
+	lc = mutex_spinlock(&dm_reg_lock);
+
+	fsrpp = &dm_registers;
+	while ((fsrp = *fsrpp) != NULL) {
+		if (!bcmp(&fsrp->fr_fsid, vfsp->vfs_altfsid, sizeof(fsrp->fr_fsid)))
+			break;
+		fsrpp = &fsrp->fr_next;
+	}
+	if (fsrp == NULL) {
+		mutex_spinunlock(&dm_reg_lock, lc);
+		panic("dm_remove_fsys_entry: can't find DMAPI fsrp for "
+			"vfsp %p\n", vfsp);
+	}
+
+	nested_spinlock(&fsrp->fr_lock);
+
+	/* Verify that it makes sense to remove this entry. */
+
+	if (fsrp->fr_state != DM_STATE_MOUNTING &&
+	    fsrp->fr_state != DM_STATE_UNMOUNTED) {
+		nested_spinunlock(&fsrp->fr_lock);
+		mutex_spinunlock(&dm_reg_lock, lc);
+		panic("dm_remove_fsys_entry: DMAPI sequence error: old state "
+			"%d, fsrp %p\n", fsrp->fr_state, fsrp);
+	}
+
+	*fsrpp = fsrp->fr_next;		/* unlink the entry from the list */
+	dm_fsys_cnt--;
+
+	nested_spinunlock(&dm_reg_lock);	/* fr_lock stays held; 'lc' is released with it below */
+
+	/* Since the filesystem is about to finish unmounting, we must be sure
+	   that no vnodes are being referenced within the filesystem before we
+	   let this event thread continue.  If the filesystem is currently in
+	   state DM_STATE_MOUNTING, then we know by definition that there can't
+	   be any references.  If the filesystem is DM_STATE_UNMOUNTED, then
+	   any application threads referencing handles with DM_NO_TOKEN should
+	   have already been awakened by dm_change_fsys_entry and should be
+	   long gone by now.  Just in case they haven't yet left, sleep here
+	   until they are really gone.
+
+	   'fr_unmount' counts this thread as a waiter so that
+	   dm_handle_to_vp() knows to broadcast on 'fr_queue' once
+	   'fr_hdlcnt' has dropped to zero.  The lock is re-taken after each
+	   sv_wait(); presumably sv_wait() releases it while sleeping.
+	*/
+
+	while (fsrp->fr_hdlcnt) {
+		fsrp->fr_unmount++;
+		sv_wait(&fsrp->fr_queue, 1, &fsrp->fr_lock, lc);
+		lc = mutex_spinlock(&fsrp->fr_lock);
+		fsrp->fr_unmount--;
+	}
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+
+	/* Release all memory.  The proc entry is looked up by the same
+	   name that dm_add_fsys_entry() built from the fsrp address.
+	*/
+
+#ifdef CONFIG_PROC_FS
+	{
+	char buf[100];
+	sprintf(buf, DMAPI_DBG_PROCFS "/fsreg/0x%p", fsrp);
+	remove_proc_entry(buf, NULL);
+	}
+#endif
+	sv_destroy(&fsrp->fr_dispq);
+	sv_destroy(&fsrp->fr_queue);
+	spinlock_destroy(&fsrp->fr_lock);
+	kfree(fsrp->fr_msg);
+	kmem_cache_free(dm_fsreg_cachep, fsrp);
+}
+
+
+/* Get a vnode for the object referenced by handlep.  We cannot use
+   altgetvfs() because it fails if the VFS_OFFLINE bit is set, which means
+   that any call to dm_handle_to_vp() while a umount is in progress would
+   return an error, even if the umount can't possibly succeed because users
+   are in the filesystem.  The requests would start to fail as soon as the
+   umount begins, even before the application receives the DM_EVENT_PREUNMOUNT
+   event.
+
+   dm_handle_to_vp() emulates the behavior of lookup() while an unmount is
+   in progress.	 Any call to dm_handle_to_vp() while the filesystem is in the
+   DM_STATE_UNMOUNTING state will block.  If the unmount eventually succeeds,
+   the requests will wake up and fail.	If the unmount fails, the requests will
+   wake up and complete normally.
+
+   While a filesystem is in state DM_STATE_MOUNTING, dm_handle_to_vp() will
+   fail all requests.  Per the DMAPI spec, the only handles in the filesystem
+   which are valid during a mount event are the handles within the event
+   itself.
+
+   Returns the vnode and stores its DM_TDT_* type in '*typep'.  Returns
+   NULL, leaving '*typep' untouched, if the filesystem is not registered,
+   is mounting (and a file handle was given), has been unmounted, or if the
+   VFS_ROOT/VFS_VGET call fails.
+*/
+
+vnode_t *
+dm_handle_to_vp(
+	xfs_handle_t	*handlep,
+	short		*typep)
+{
+	dm_fsreg_t	*fsrp;
+	vnode_t		*vp;
+	short		type;
+	unsigned long	lc;		/* lock cookie */
+	int		error;
+	fid_t		*fidp;
+
+	if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handlep->ha_fsid, &lc)) == NULL)
+		return(NULL);
+
+	fidp = (fid_t*)&handlep->ha_fid;
+	/* If mounting, and we are not asking for a filesystem handle,
+	 * then fail the request.  (fid_len==0 for fshandle)
+	 */
+	if ((fsrp->fr_state == DM_STATE_MOUNTING) &&
+	    (fidp->fid_len != 0)) {
+		mutex_spinunlock(&fsrp->fr_lock, lc);
+		return(NULL);
+	}
+
+	for (;;) {
+		if (fsrp->fr_state == DM_STATE_MOUNTING)
+			break;
+		if (fsrp->fr_state == DM_STATE_MOUNTED)
+			break;
+		if (fsrp->fr_state == DM_STATE_UNMOUNTED) {
+			/* Last sleeper out wakes a waiting dm_remove_fsys_entry(). */
+			if (fsrp->fr_unmount && fsrp->fr_hdlcnt == 0)
+				sv_broadcast(&fsrp->fr_queue);
+			mutex_spinunlock(&fsrp->fr_lock, lc);
+			return(NULL);
+		}
+
+		/* Must be DM_STATE_UNMOUNTING.  Sleep until
+		   dm_change_fsys_entry() moves the filesystem to another
+		   state, then re-evaluate it from the top of the loop.
+		*/
+
+		fsrp->fr_hdlcnt++;
+		sv_wait(&fsrp->fr_queue, 1, &fsrp->fr_lock, lc);
+		lc = mutex_spinlock(&fsrp->fr_lock);
+		fsrp->fr_hdlcnt--;
+	}
+
+	fsrp->fr_vfscnt++;	/* dm_change_fsys_entry() waits for this to reach zero */
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+
+	/* Now that the mutex is released, wait until we have access to the
+	   vnode.
+	*/
+
+	if (fidp->fid_len == 0) {	/* filesystem handle */
+		VFS_ROOT(fsrp->fr_vfsp, &vp, error);
+	} else {				/* file object handle */
+		VFS_VGET(fsrp->fr_vfsp, &vp, fidp, error);
+	}
+
+	lc = mutex_spinlock(&fsrp->fr_lock);
+
+	fsrp->fr_vfscnt--;
+	if (fsrp->fr_unmount && fsrp->fr_vfscnt == 0)
+		sv_broadcast(&fsrp->fr_queue);
+
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+	if (error || vp == NULL)
+		return(NULL);
+
+	/* Classify the object: a zero-length fid means the filesystem
+	   itself, otherwise the type follows the vnode's v_type.
+	*/
+	if (fidp->fid_len == 0) {
+		type = DM_TDT_VFS;
+	} else if (vp->v_type == VREG) {
+		type = DM_TDT_REG;
+	} else if (vp->v_type == VDIR) {
+		type = DM_TDT_DIR;
+	} else if (vp->v_type == VLNK) {
+		type = DM_TDT_LNK;
+	} else {
+		type = DM_TDT_OTH;
+	}
+	*typep = type;
+	return(vp);
+}
+
+/* dm_vp_to_handle() builds the handle for vnode 'vp' in '*handlep': the
+   filesystem's vfs_altfsid followed by the fid obtained from VOP_FID2(),
+   with the unused tail of the handle zeroed.  Returns 0 on success, EINVAL
+   if the filesystem has no vfs_altfsid, or the error set by VOP_FID2().
+*/
+int
+dm_vp_to_handle(
+	vnode_t		*vp,
+	xfs_handle_t	*handlep)
+{
+	int		error;
+	struct	fid	fid;
+	int		hsize;
+
+	if (vp->v_vfsp->vfs_altfsid == NULL)
+		return(EINVAL);
+
+	VOP_FID2(vp, &fid, error);
+	if (error)
+		return(error);
+
+	bcopy (vp->v_vfsp->vfs_altfsid, &handlep->ha_fsid, sizeof(fsid_t));
+	bcopy(&fid, &handlep->ha_fid, fid.fid_len + sizeof fid.fid_len);	/* fid payload plus its length field */
+	hsize = XFS_HSIZE(*handlep);
+	bzero ((char *)handlep + hsize, sizeof(*handlep) - hsize);
+	return(0);
+}
+
+
+/* Given a vnode,
check if that vnode resides in filesystem that supports + DMAPI. Returns zero if the vnode is in a DMAPI filesystem, otherwise + returns an errno. +*/ + +int +dm_check_dmapi_vp( + vnode_t *vp) +{ + xfs_handle_t handle; + /* REFERENCED */ + dm_fsreg_t *fsrp; + int error; + unsigned long lc; /* lock cookie */ + + if ((error = dm_vp_to_handle(vp, &handle)) != 0) + return(error); + + if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL) + return(EBADF); + mutex_spinunlock(&fsrp->fr_lock, lc); + return(0); +} + + +/* Return a pointer to the DM_EVENT_MOUNT event while a mount is still in + progress. This is only called by dm_get_config and dm_get_config_events + which need to access the filesystem during a mount but which don't have + a session and token to use. +*/ + +dm_tokevent_t * +dm_find_mount_tevp_and_lock( + fsid_t *fsidp, + unsigned long *lcp) /* address of returned lock cookie */ +{ + dm_fsreg_t *fsrp; + + if ((fsrp = dm_find_fsreg_and_lock(fsidp, lcp)) == NULL) + return(NULL); + + if (!fsrp->fr_tevp || fsrp->fr_state != DM_STATE_MOUNTING) { + mutex_spinunlock(&fsrp->fr_lock, *lcp); + return(NULL); + } + nested_spinlock(&fsrp->fr_tevp->te_lock); + nested_spinunlock(&fsrp->fr_lock); + return(fsrp->fr_tevp); +} + + +/* Wait interruptibly until a session registers disposition for 'event' in + filesystem 'vfsp'. Upon successful exit, both the filesystem's dm_fsreg_t + structure and the session's dm_session_t structure are locked. The caller + is responsible for unlocking both structures using the returned cookies. + + Warning: The locks can be dropped in any order, but the 'lc2p' cookie MUST + BE USED FOR THE FIRST UNLOCK, and the lc1p cookie must be used for the + second unlock. If this is not done, the CPU will be interruptible while + holding a mutex, which could deadlock the machine! 
+
+   Returns 0 on success with '*sessionpp' and '*fsrpp' filled in.  Returns
+   ENOENT, with no lock held, if the filesystem is not registered or if the
+   event is DM_EVENT_UNMOUNT and no session is registered for it.  Returns
+   EINTR, with no lock held, if a signal is pending after the wait.
+*/
+
+static int
+dm_waitfor_disp(
+	vfs_t		*vfsp,
+	dm_tokevent_t	*tevp,
+	dm_fsreg_t	**fsrpp,
+	unsigned long	*lc1p,		/* addr of first returned lock cookie */
+	dm_session_t	**sessionpp,
+	unsigned long	*lc2p)		/* addr of 2nd returned lock cookie */
+{
+	dm_eventtype_t	event = tevp->te_msg.ev_type;	/* used unchecked as an index into fr_sessp[] */
+	dm_session_t	*s;
+	dm_fsreg_t	*fsrp;
+
+	if ((fsrp = dm_find_fsreg_and_lock(vfsp->vfs_altfsid, lc1p)) == NULL)
+		return(ENOENT);
+
+	/* If no session is registered for this event in the specified
+	   filesystem, then sleep interruptibly until one does.
+	*/
+
+	for (;;) {
+		int	rc = 0;
+
+		/* The dm_find_session_and_lock() call is needed because a
+		   session that is in the process of being removed might still
+		   be in the dm_fsreg_t structure but won't be in the
+		   dm_sessions list.
+		*/
+
+		if ((s = fsrp->fr_sessp[event]) != NULL &&
+		    dm_find_session_and_lock(s->sn_sessid, &s, lc2p) == 0) {
+			break;
+		}
+
+		/* No one is currently registered.  DM_EVENT_UNMOUNT events
+		   don't wait for anyone to register because the unmount is
+		   already past the point of no return.
+		*/
+
+		if (event == DM_EVENT_UNMOUNT) {
+			mutex_spinunlock(&fsrp->fr_lock, *lc1p);
+			return(ENOENT);
+		}
+
+		/* Wait until a session registers for disposition of this
+		   event.  While waiting, the event is linked onto the
+		   filesystem's 'fr_evt_dispq' and counted in 'fr_dispcnt';
+		   both are undone once fr_lock has been re-taken.
+		*/
+
+		fsrp->fr_dispcnt++;
+		dm_link_event(tevp, &fsrp->fr_evt_dispq);
+
+		sv_wait_sig(&fsrp->fr_dispq, 1, &fsrp->fr_lock, *lc1p);
+		rc = signal_pending(current);
+
+		*lc1p = mutex_spinlock(&fsrp->fr_lock);
+		fsrp->fr_dispcnt--;
+		dm_unlink_event(tevp, &fsrp->fr_evt_dispq);
+		if (rc) {		/* if signal was received */
+			mutex_spinunlock(&fsrp->fr_lock, *lc1p);
+			return(EINTR);
+		}
+	}
+	*sessionpp = s;
+	*fsrpp = fsrp;
+	return(0);
+}
+
+
+/* Returns the session pointer for the session registered for an event
+   in the given vfsp.  If successful, the session is locked upon return.  The
+   caller is responsible for releasing the lock.
If no session is currently + registered for the event, dm_waitfor_disp_session() will sleep interruptibly + until a registration occurs. +*/ + +int +dm_waitfor_disp_session( + vfs_t *vfsp, + dm_tokevent_t *tevp, + dm_session_t **sessionpp, + unsigned long *lcp) +{ + dm_fsreg_t *fsrp; + unsigned long lc2; + int error; + + if (tevp->te_msg.ev_type < 0 || tevp->te_msg.ev_type > DM_EVENT_MAX) + return(EIO); + + error = dm_waitfor_disp(vfsp, tevp, &fsrp, lcp, sessionpp, &lc2); + if (!error) + mutex_spinunlock(&fsrp->fr_lock, lc2); /* rev. cookie order*/ + return(error); +} + + +/* Find the session registered for the DM_EVENT_DESTROY event on the specified + filesystem, sleeping if necessary until registration occurs. Once found, + copy the session's return-on-destroy attribute name, if any, back to the + caller. +*/ + +int +dm_waitfor_destroy_attrname( + vfs_t *vfsp, + dm_attrname_t *attrnamep) +{ + dm_tokevent_t *tevp; + dm_session_t *s; + dm_fsreg_t *fsrp; + int error; + unsigned long lc1; /* first lock cookie */ + unsigned long lc2; /* second lock cookie */ + void *msgp; + + tevp = dm_evt_create_tevp(DM_EVENT_DESTROY, 1, (void**)&msgp); + error = dm_waitfor_disp(vfsp, tevp, &fsrp, &lc1, &s, &lc2); + if (!error) { + *attrnamep = fsrp->fr_rattr; /* attribute or zeros */ + mutex_spinunlock(&s->sn_qlock, lc2); /* rev. cookie order */ + mutex_spinunlock(&fsrp->fr_lock, lc1); + } + dm_evt_rele_tevp(tevp,0); + return(error); +} + + +/* Unregisters the session for the disposition of all events on all + filesystems. This routine is not called until the session has been + dequeued from the session list and its session lock has been dropped, + but before the actual structure is freed, so it is safe to grab the + 'dm_reg_lock' here. If dm_waitfor_disp_session() happens to be called + by another thread, it won't find this session on the session list and + will wait until a new session registers. 
+*/ + +void +dm_clear_fsreg( + dm_session_t *s) +{ + dm_fsreg_t *fsrp; + int event; + unsigned long lc; /* lock cookie */ + + lc = mutex_spinlock(&dm_reg_lock); + + for (fsrp = dm_registers; fsrp != NULL; fsrp = fsrp->fr_next) { + nested_spinlock(&fsrp->fr_lock); + for (event = 0; event < DM_EVENT_MAX; event++) { + if (fsrp->fr_sessp[event] != s) + continue; + fsrp->fr_sessp[event] = NULL; + if (event == DM_EVENT_DESTROY) + bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr)); + } + nested_spinunlock(&fsrp->fr_lock); + } + + mutex_spinunlock(&dm_reg_lock, lc); +} + + +/* + * Return the handle for the object named by path. + */ + +int +dm_path_to_hdl( + char *path, /* any path name */ + void *hanp, /* user's data buffer */ + size_t *hlenp) /* set to size of data copied */ +{ + /* REFERENCED */ + dm_fsreg_t *fsrp; + xfs_handle_t handle; + vnode_t *vp; + size_t hlen; + int error; + unsigned long lc; /* lock cookie */ + struct nameidata nd; + struct inode *inode; + size_t len; + char *name; + + /* XXX get things straightened out so getname() works here? 
*/ + len = strnlen_user(path, 2000); + name = kmalloc(len, GFP_KERNEL); + if (name == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return(ENOMEM); + } + if (copy_from_user(name, path, len)) { + kfree(name); + return(EFAULT); + } + + error = 0; + if (path_init(name, LOOKUP_POSITIVE, &nd)) + error = path_walk(name, &nd); + kfree(name); + if (error) + return error; + + ASSERT(nd.dentry); + ASSERT(nd.dentry->d_inode); + inode = igrab(nd.dentry->d_inode); + path_release(&nd); + + if (inode->i_sb->s_magic != XFS_SB_MAGIC) { + /* we're not in XFS anymore, Toto */ + iput(inode); + return EINVAL; + } + + /* we need the vnode */ + vp = LINVFS_GET_VP(inode); + error = dm_vp_to_handle(vp, &handle); + iput(inode); + if (error) + return(error); + + if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL) + return(EBADF); + mutex_spinunlock(&fsrp->fr_lock, lc); + + hlen = XFS_HSIZE(handle); + + if (copy_to_user(hanp, &handle, (int)hlen)) + return(EFAULT); + return(put_user(hlen,hlenp)); +} + + +/* + * Return the handle for the file system containing the object named by path. + */ + +int +dm_path_to_fshdl( + char *path, /* any path name */ + void *hanp, /* user's data buffer */ + size_t *hlenp) /* set to size of data copied */ +{ + /* REFERENCED */ + dm_fsreg_t *fsrp; + xfs_handle_t handle; + vnode_t *vp; + size_t hlen; + int error; + unsigned long lc; /* lock cookie */ + struct nameidata nd; + struct inode *inode; + size_t len; + char *name; + + /* XXX get things straightened out so getname() works here? 
*/ + len = strnlen_user(path, 2000); + name = kmalloc(len, GFP_KERNEL); + if (name == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return(ENOMEM); + } + if (copy_from_user(name, path, len)) { + kfree(name); + return(EFAULT); + } + + error = 0; + if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd)) + error = path_walk(name, &nd); + kfree(name); + if (error) + return error; + + ASSERT(nd.dentry); + ASSERT(nd.dentry->d_inode); + + inode = igrab(nd.dentry->d_inode); + path_release(&nd); + + if (inode->i_sb->s_magic != XFS_SB_MAGIC) { + /* we're not in XFS anymore, Toto */ + iput(inode); + return EINVAL; + } + + /* we need the vnode */ + vp = LINVFS_GET_VP(inode); + error = dm_vp_to_handle(vp, &handle); + iput(inode); + + if (error) + return(error); + + if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL) + return(EBADF); + mutex_spinunlock(&fsrp->fr_lock, lc); + + hlen = FSHSIZE; + if(copy_to_user(hanp, &handle, (int)hlen)) + return(EFAULT); + return(put_user(hlen,hlenp)); +} + + +int +dm_fd_to_hdl( + int fd, /* any file descriptor */ + void *hanp, /* user's data buffer */ + size_t *hlenp) /* set to size of data copied */ +{ + /* REFERENCED */ + dm_fsreg_t *fsrp; + xfs_handle_t handle; + size_t hlen; + int error; + unsigned long lc; /* lock cookie */ + struct file *filep = fget(fd); + + if (!filep) + return(EBADF); + if ((error = dm_vp_to_handle(LINVFS_GET_VP(filep->f_dentry->d_inode), &handle)) != 0) + return(error); + + if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL) + return(EBADF); + mutex_spinunlock(&fsrp->fr_lock, lc); + + hlen = XFS_HSIZE(handle); + if (copy_to_user(hanp, &handle, (int)hlen)) + return(EFAULT); + fput(filep); + return(put_user(hlen, hlenp)); +} + + +/* Enable events on an object. 
*/ + +int +dm_set_eventlist( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_eventset_t *eventsetp, + u_int maxevent) +{ + dm_fsys_vector_t *fsys_vector; + dm_eventset_t eventset; + dm_tokdata_t *tdp; + int error; + + if (copy_from_user(&eventset, eventsetp, sizeof(eventset))) + return(EFAULT); + + /* Do some minor sanity checking. */ + + if (maxevent == 0 || maxevent > DM_EVENT_MAX) + return(EINVAL); + + /* Access the specified object. */ + + error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_ANY, + DM_RIGHT_EXCL, &tdp); + if (error != 0) + return(error); + + VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp)); + fsys_vector = dm_fsys_vector(tdp->td_vp); + error = fsys_vector->set_eventlist(tdp->td_vp, tdp->td_right, + (tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0), + &eventset, maxevent); + VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp)); + + dm_app_put_tdp(tdp); + return(error); +} + + +/* Return the list of enabled events for an object. */ + +int +dm_get_eventlist( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + u_int nelem, + dm_eventset_t *eventsetp, + u_int *nelemp) +{ + dm_fsys_vector_t *fsys_vector; + dm_tokdata_t *tdp; + dm_eventset_t eventset; + u_int elem; + int error; + + if (nelem == 0) + return(EINVAL); + + /* Access the specified object. */ + + error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_ANY, + DM_RIGHT_SHARED, &tdp); + if (error != 0) + return(error); + + /* Get the object's event list. */ + + VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp)); + fsys_vector = dm_fsys_vector(tdp->td_vp); + error = fsys_vector->get_eventlist(tdp->td_vp, tdp->td_right, + (tdp->td_type == DM_TDT_VFS ? 
DM_FSYS_OBJ : 0), + nelem, &eventset, &elem); + VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp)); + + dm_app_put_tdp(tdp); + + if (error) + return(error); + + if (copy_to_user(eventsetp, &eventset, sizeof(eventset))) + return(EFAULT); + if (put_user(nelem, nelemp)) + return(EFAULT); + return(0); +} + + +/* Register for disposition of events. The handle must either be the + global handle or must be the handle of a file system. The list of events + is pointed to by eventsetp. +*/ + +int +dm_set_disp( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_eventset_t *eventsetp, + u_int maxevent) +{ + dm_session_t *s; + dm_fsreg_t *fsrp; + dm_tokdata_t *tdp; + dm_eventset_t eventset; + int error; + unsigned long lc1; /* first lock cookie */ + unsigned long lc2; /* second lock cookie */ + u_int i; + + /* Copy in and validate the event mask. Only the lower maxevent bits + are meaningful, so clear any bits set above maxevent. + */ + + if (maxevent == 0 || maxevent > DM_EVENT_MAX) + return(EINVAL); + if (copy_from_user(&eventset, eventsetp, sizeof(eventset))) + return(EFAULT); + eventset &= (1 << maxevent) - 1; + + /* If the caller specified the global handle, then the only valid token + is DM_NO_TOKEN, and the only valid event in the event mask is + DM_EVENT_MOUNT. If it is set, add the session to the list of + sessions that want to receive mount events. If it is clear, remove + the session from the list. Since DM_EVENT_MOUNT events never block + waiting for a session to register, there is noone to wake up if we + do add the session to the list. 
+ */ + + if (DM_GLOBALHAN(hanp, hlen)) { + if (token != DM_NO_TOKEN) + return(EINVAL); + if ((error = dm_find_session_and_lock(sid, &s, &lc1)) != 0) + return(error); + if (eventset == 0) { + s->sn_flags &= ~DM_SN_WANTMOUNT; + error = 0; + } else if (eventset == 1 << DM_EVENT_MOUNT) { + s->sn_flags |= DM_SN_WANTMOUNT; + error = 0; + } else { + error = EINVAL; + } + mutex_spinunlock(&s->sn_qlock, lc1); + return(error); + } + + /* Since it's not the global handle, it had better be a filesystem + handle. Verify that the first 'maxevent' events in the event list + are all valid for a filesystem handle. + */ + + if (eventset & ~DM_VALID_DISP_EVENTS) + return(EINVAL); + + /* Verify that the session is valid, that the handle is a filesystem + handle, and that the filesystem is capable of sending events. (If + a dm_fsreg_t structure exists, then the filesystem can issue events.) + */ + + error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS, + DM_RIGHT_EXCL, &tdp); + if (error != 0) + return(error); + + fsrp = dm_find_fsreg_and_lock((fsid_t*)&tdp->td_handle.ha_fsid, &lc1); + if (fsrp == NULL) { + dm_app_put_tdp(tdp); + return(EINVAL); + } + + /* Now that we own 'fsrp->fr_lock', get the lock on the session so that + it can't disappear while we add it to the filesystem's event mask. + */ + + if ((error = dm_find_session_and_lock(sid, &s, &lc2)) != 0) { + mutex_spinunlock(&fsrp->fr_lock, lc1); + dm_app_put_tdp(tdp); + return(error); + } + + /* Update the event disposition array for this filesystem, adding + and/or removing the session as appropriate. If this session is + dropping registration for DM_EVENT_DESTROY, or is overriding some + other session's registration for DM_EVENT_DESTROY, then clear any + any attr-on-destroy attribute name also. 
+ */ + + for (i = 0; i < DM_EVENT_MAX; i++) { + if (DMEV_ISSET(i, eventset)) { + if (i == DM_EVENT_DESTROY && fsrp->fr_sessp[i] != s) + bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr)); + fsrp->fr_sessp[i] = s; + } else if (fsrp->fr_sessp[i] == s) { + if (i == DM_EVENT_DESTROY) + bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr)); + fsrp->fr_sessp[i] = NULL; + } + } + mutex_spinunlock(&s->sn_qlock, lc2); /* reverse cookie order */ + + /* Wake up all processes waiting for a disposition on this filesystem + in case any of them happen to be waiting for an event which we just + added. + */ + + if (fsrp->fr_dispcnt) + sv_broadcast(&fsrp->fr_dispq); + + mutex_spinunlock(&fsrp->fr_lock, lc1); + + dm_app_put_tdp(tdp); + return(0); +} + + +/* + * Register a specific attribute name with a filesystem. The value of + * the attribute is to be returned with an asynchronous destroy event. + */ + +int +dm_set_return_on_destroy( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + dm_attrname_t *attrnamep, + dm_boolean_t enable) +{ + dm_attrname_t attrname; + dm_tokdata_t *tdp; + dm_fsreg_t *fsrp; + dm_session_t *s; + int error; + unsigned long lc1; /* first lock cookie */ + unsigned long lc2; /* second lock cookie */ + + /* If a dm_attrname_t is provided, copy it in and validate it. */ + + if (enable && (error = copy_from_user(&attrname, attrnamep, sizeof(attrname))) != 0) + return(error); + + /* Validate the filesystem handle and use it to get the filesystem's + disposition structure. + */ + + error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS, + DM_RIGHT_EXCL, &tdp); + if (error != 0) + return(error); + + fsrp = dm_find_fsreg_and_lock((fsid_t*)&tdp->td_handle.ha_fsid, &lc1); + if (fsrp == NULL) { + dm_app_put_tdp(tdp); + return(EINVAL); + } + + /* Now that we own 'fsrp->fr_lock', get the lock on the session so that + it can't disappear while we add it to the filesystem's event mask. 
+ */ + + if ((error = dm_find_session_and_lock(sid, &s, &lc2)) != 0) { + mutex_spinunlock(&fsrp->fr_lock, lc1); + dm_app_put_tdp(tdp); + return(error); + } + + /* A caller cannot disable return-on-destroy if he is not registered + for DM_EVENT_DESTROY. Enabling return-on-destroy is an implicit + dm_set_disp() for DM_EVENT_DESTROY; we wake up all processes + waiting for a disposition in case any was waiting for a + DM_EVENT_DESTROY event. + */ + + error = 0; + if (enable) { + fsrp->fr_sessp[DM_EVENT_DESTROY] = s; + fsrp->fr_rattr = attrname; + if (fsrp->fr_dispcnt) + sv_broadcast(&fsrp->fr_dispq); + } else if (fsrp->fr_sessp[DM_EVENT_DESTROY] != s) { + error = EINVAL; + } else { + bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr)); + } + mutex_spinunlock(&s->sn_qlock, lc2); /* reverse cookie order */ + mutex_spinunlock(&fsrp->fr_lock, lc1); + dm_app_put_tdp(tdp); + return(error); +} + + +int +dm_get_mountinfo( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + size_t buflen, + void *bufp, + size_t *rlenp) +{ + dm_fsreg_t *fsrp; + dm_tokdata_t *tdp; + int error; + unsigned long lc; /* lock cookie */ + + /* Make sure that the caller's buffer is 8-byte aligned. */ + + if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0) + return(EFAULT); + + /* Verify that the handle is a filesystem handle, and that the + filesystem is capable of sending events. If not, return an error. + */ + + error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS, + DM_RIGHT_SHARED, &tdp); + if (error != 0) + return(error); + + /* Find the filesystem entry. This should always succeed as the + dm_app_get_tdp call created a filesystem reference. Once we find + the entry, drop the lock. The mountinfo message is never modified, + the filesystem entry can't disappear, and we don't want to hold a + spinlock while doing copyout calls. 
+ */ + + fsrp = dm_find_fsreg_and_lock((fsid_t*)&tdp->td_handle.ha_fsid, &lc); + if (fsrp == NULL) { + dm_app_put_tdp(tdp); + return(EINVAL); + } + mutex_spinunlock(&fsrp->fr_lock, lc); + + /* Copy the message into the user's buffer and update his 'rlenp'. */ + + if (put_user(fsrp->fr_msgsize, rlenp)) { + error = EFAULT; + } else if (fsrp->fr_msgsize > buflen) { /* user buffer not big enough */ + error = E2BIG; + } else if (copy_to_user(bufp, fsrp->fr_msg, fsrp->fr_msgsize)) { + error = EFAULT; + } else { + error = 0; + } + dm_app_put_tdp(tdp); + return(error); +} + + +int +dm_getall_disp( + dm_sessid_t sid, + size_t buflen, + void *bufp, + size_t *rlenp) +{ + dm_session_t *s; /* pointer to session given by sid */ + unsigned long lc1; /* first lock cookie */ + unsigned long lc2; /* second lock cookie */ + int totalsize; + int msgsize; + int fsyscnt; + dm_dispinfo_t *prevmsg; + dm_fsreg_t *fsrp; + int error; + char *kbuf; + + int tmp3; + int tmp4; + + /* Because the dm_getall_disp structure contains a __u64 field, + make sure that the buffer provided by the caller is aligned so + that he can read such fields successfully. + */ + + if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0) + return(EFAULT); + + /* Compute the size of a dm_dispinfo structure, rounding up to an + 8-byte boundary so that any subsequent structures will also be + aligned. + */ + +#if 0 + /* XXX ug, what is going on here? */ + msgsize = (sizeof(dm_dispinfo_t) + FSHSIZE + sizeof(uint64_t) - 1) & + ~(sizeof(uint64_t) - 1); +#else + tmp3 = sizeof(dm_dispinfo_t) + FSHSIZE; + tmp3 += sizeof(__u64); + tmp3 -= 1; + tmp4 = ~(sizeof(__u64) - 1); + msgsize = tmp3 & tmp4; +#endif + + /* Loop until we can get the right amount of temp space, being careful + not to hold a mutex during the allocation. Usually only one trip. 
+ */ + + for (;;) { + if ((fsyscnt = dm_fsys_cnt) == 0) { + /*if (dm_cpoutsizet(rlenp, 0))*/ + if (put_user(0,rlenp)) + return(EFAULT); + return(0); + } + kbuf = kmalloc(fsyscnt * msgsize, GFP_KERNEL); + if (kbuf == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + + lc1 = mutex_spinlock(&dm_reg_lock); + if (fsyscnt == dm_fsys_cnt) + break; + + mutex_spinunlock(&dm_reg_lock, lc1); + kfree(kbuf); + } + + /* Find the indicated session and lock it. */ + + if ((error = dm_find_session_and_lock(sid, &s, &lc2)) != 0) { + mutex_spinunlock(&dm_reg_lock, lc1); + kfree(kbuf); + return(error); + } + + /* Create a dm_dispinfo structure for each filesystem in which + this session has at least one event selected for disposition. + */ + + totalsize = 0; /* total bytes to transfer to the user */ + prevmsg = NULL; + + for (fsrp = dm_registers; fsrp; fsrp = fsrp->fr_next) { + dm_dispinfo_t *disp; + int event; + int found; + + disp = (dm_dispinfo_t *)(kbuf + totalsize); + + DMEV_ZERO(disp->di_eventset); + + for (event = 0, found = 0; event < DM_EVENT_MAX; event++) { + if (fsrp->fr_sessp[event] != s) + continue; + DMEV_SET(event, disp->di_eventset); + found++; + } + if (!found) + continue; + + disp->_link = 0; + disp->di_fshandle.vd_offset = sizeof(dm_dispinfo_t); + disp->di_fshandle.vd_length = FSHSIZE; + + bcopy(&fsrp->fr_fsid, + (char *)disp + disp->di_fshandle.vd_offset, + disp->di_fshandle.vd_length); + + if (prevmsg) + prevmsg->_link = msgsize; + + prevmsg = disp; + totalsize += msgsize; + } + mutex_spinunlock(&s->sn_qlock, lc2); /* reverse cookie order */ + mutex_spinunlock(&dm_reg_lock, lc1); + + if (put_user(totalsize, rlenp)) { + error = EFAULT; + } else if (totalsize > buflen) { /* no more room */ + error = E2BIG; + } else if (totalsize && copy_to_user(bufp, kbuf, totalsize)) { + error = EFAULT; + } else { + error = 0; + } + + kfree(kbuf); + return(error); +} + +int +dm_open_by_handle_rvp( + unsigned int fd, + void *hanp, + 
size_t hlen, + int flags, + int *rvp) +{ + xfs_handle_t handle; + int error; + vnode_t *vp; + short td_type; + struct dentry *dentry; + struct inode *inodep; + int new_fd; + struct file *mfilp; + struct file *filp; + struct list_head *lp; + + if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0) { + return(error); + } + + if ((vp = dm_handle_to_vp(&handle, &td_type)) == NULL) { + return(EBADF); + } + inodep = LINVFS_GET_IP(vp); + if ((td_type == DM_TDT_VFS) || (td_type == DM_TDT_OTH)) { + iput(inodep); + return(EBADF); + } + + if ((new_fd = get_unused_fd()) < 0) { + iput(inodep); + return(EMFILE); + } + + /* Now to find a dentry. If possible, get a well-connected one. */ + spin_lock(&dcache_lock); + for (lp = inodep->i_dentry.next; lp != &inodep->i_dentry ; lp=lp->next) { + dentry = list_entry(lp,struct dentry, d_alias); + if (! (dentry->d_flags & DCACHE_NFSD_DISCONNECTED)) { + dget_locked(dentry); + dentry->d_vfs_flags |= DCACHE_REFERENCED; + spin_unlock(&dcache_lock); + iput(inodep); + goto found; + } + } + spin_unlock(&dcache_lock); + + /* ELSE didn't find dentry. Create anonymous dcache entry. */ + dentry = d_alloc_root(inodep); + if (dentry == NULL) { + iput(inodep); + put_unused_fd(new_fd); + return(ENOMEM); + } + /* keep nfsd happy. 
*/ + dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; + +found: + + if( inodep->i_ino != dentry->d_inode->i_ino ){ + dput(dentry); + put_unused_fd(new_fd); + return(EINVAL); + } + + mfilp = fget(fd); + if (!mfilp) { + dput(dentry); + put_unused_fd(new_fd); + return(EBADF); + } + + mntget(mfilp->f_vfsmnt); + + /* Create file pointer */ + filp = dentry_open(dentry, mfilp->f_vfsmnt, flags); + if (IS_ERR(filp)) { + put_unused_fd(new_fd); + fput(mfilp); + return -PTR_ERR(filp); + } + + if (td_type == DM_TDT_REG) + filp->f_mode |= FINVIS; + fd_install(new_fd, filp); + fput(mfilp); + *rvp = new_fd; + return 0; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_right.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_right.c --- linux-2.4.19/fs/xfs/dmapi/dmapi_right.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_right.c Sat Aug 24 17:08:30 2002 @@ -0,0 +1,1268 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "dmapi_private.h" + + +#define DM_FG_STHREAD 0x001 /* keep other threads from using tdp */ +#define DM_FG_MUSTEXIST 0x002 /* handle must exist in the event */ +#define DM_FG_DONTADD 0x004 /* don't add handle if not in event */ + +/* Get a handle of the form (void *, size_t) from user space and convert it to + a handle_t. Do as much validation of the result as possible; any error + other than a bad address should return EBADF per the DMAPI spec. +*/ + +int +dm_copyin_handle( + void *hanp, /* input, handle data */ + size_t hlen, /* input, size of handle data */ + xfs_handle_t *handlep) /* output, copy of data */ +{ + u_short len; + fid_t *fidp; + + fidp = (fid_t*)&handlep->ha_fid; + + if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) + return(EBADF); + + if (copy_from_user(handlep, hanp, hlen)) + return(EFAULT); + + if (hlen < sizeof(*handlep)) + bzero((char *)handlep + hlen, sizeof(*handlep) - hlen); + + if (hlen == sizeof(handlep->ha_fsid)) + return(0); /* FS handle, nothing more to check */ + + len = hlen - sizeof(handlep->ha_fsid) - sizeof(fidp->fid_len); + + if (fidp->fid_len != len || + *((short *) fidp->fid_data)) { + return(EBADF); + } + return(0); +} + +/* Allocate and initialize a tevp structure. Called from both application and + event threads. +*/ + +static dm_tokevent_t * +dm_init_tevp( + int ev_size, /* size of event structure */ + int var_size) /* size of variable-length data */ +{ + dm_tokevent_t *tevp; + int msgsize; + + /* Calculate the size of the event in bytes and allocate memory for it. + Zero all but the variable portion of the message, which will be + eventually overlaid by the caller with data. 
+ */ + + msgsize = offsetof(dm_tokevent_t, te_event) + ev_size + var_size; + tevp = kmalloc(msgsize, GFP_KERNEL); + if (tevp == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return NULL; + } + bzero(tevp, msgsize - var_size); + + /* Now initialize all the non-zero fields. */ + + spinlock_init(&tevp->te_lock, "te_lock"); + sv_init(&tevp->te_evt_queue, SV_DEFAULT, "te_evt_queue"); + sv_init(&tevp->te_app_queue, SV_DEFAULT, "te_app_queue"); + tevp->te_allocsize = msgsize; + tevp->te_msg.ev_type = DM_EVENT_INVALID; + tevp->te_flags = 0; + + return(tevp); +} + + +/* Given the event type and the number of bytes of variable length data that + will follow the event, dm_evt_create_tevp() creates a dm_tokevent_t + structure to hold the event and initializes all the common event fields. + + No locking is required for this routine because the caller is an event + thread, and is therefore the only thread that can see the event. +*/ + +dm_tokevent_t * +dm_evt_create_tevp( + dm_eventtype_t event, + int variable_size, + void **msgpp) +{ + dm_tokevent_t *tevp; + int evsize; + + switch (event) { + case DM_EVENT_READ: + case DM_EVENT_WRITE: + case DM_EVENT_TRUNCATE: + evsize = sizeof(dm_data_event_t); + break; + + case DM_EVENT_DESTROY: + evsize = sizeof(dm_destroy_event_t); + break; + + case DM_EVENT_MOUNT: + evsize = sizeof(dm_mount_event_t); + break; + + case DM_EVENT_PREUNMOUNT: + case DM_EVENT_UNMOUNT: + case DM_EVENT_NOSPACE: + case DM_EVENT_CREATE: + case DM_EVENT_REMOVE: + case DM_EVENT_RENAME: + case DM_EVENT_SYMLINK: + case DM_EVENT_LINK: + case DM_EVENT_POSTCREATE: + case DM_EVENT_POSTREMOVE: + case DM_EVENT_POSTRENAME: + case DM_EVENT_POSTSYMLINK: + case DM_EVENT_POSTLINK: + case DM_EVENT_ATTRIBUTE: + case DM_EVENT_DEBUT: /* currently not supported */ + case DM_EVENT_CLOSE: /* currently not supported */ + evsize = sizeof(dm_namesp_event_t); + break; + + case DM_EVENT_CANCEL: /* currently not supported */ + evsize = 
sizeof(dm_cancel_event_t); + break; + + case DM_EVENT_USER: + evsize = 0; + break; + + default: + panic("dm_create_tevp: called with unknown event type %d\n", + event); + } + + /* Allocate and initialize an event structure of the correct size. */ + + tevp = dm_init_tevp(evsize, variable_size); + if (tevp == NULL) + return NULL; + tevp->te_evt_ref = 1; + + /* Fields ev_token, ev_sequence, and _link are all filled in when the + event is queued onto a session. Initialize all other fields here. + */ + + tevp->te_msg.ev_type = event; + tevp->te_msg.ev_data.vd_offset = offsetof(dm_tokevent_t, te_event) - + offsetof(dm_tokevent_t, te_msg); + tevp->te_msg.ev_data.vd_length = evsize + variable_size; + + /* Give the caller a pointer to the event-specific structure. */ + + *msgpp = ((char *)&tevp->te_msg + tevp->te_msg.ev_data.vd_offset); + return(tevp); +} + + +/* Given a pointer to an event (tevp) and a pointer to a handle_t, look for a + tdp structure within the event which contains the handle_t. Either verify + that the event contains the tdp, or optionally add the tdp to the + event. Called only from application threads. + + On entry, tevp->te_lock is held; it is dropped prior to return. +*/ + +static int +dm_app_lookup_tdp( + xfs_handle_t *handlep, /* the handle we are looking for */ + dm_tokevent_t *tevp, /* the event to search for the handle */ + unsigned long *lcp, /* address of active lock cookie */ + short types, /* acceptable object types */ + dm_right_t right, /* minimum right the object must have */ + u_int flags, + dm_tokdata_t **tdpp) /* if ! NULL, pointer to matching tdp */ +{ + dm_fsys_vector_t *fsys_vector; + dm_tokdata_t *tdp; + vnode_t *vp; + int error; + + /* Bump the tevp application reference counter so that the event + can't disappear in case we have to drop the lock for a while. + */ + + tevp->te_app_ref++; + *tdpp = NULL; /* assume failure */ + + for (;;) { + /* Look for a matching tdp in the tevp. 
*/ + + for (tdp = tevp->te_tdp; tdp; tdp = tdp->td_next) { + if (XFS_HANDLE_CMP(&tdp->td_handle, handlep) == 0) + break; + } + + /* If the tdp exists, but either we need single-thread access + to the handle and can't get it, or some other thread already + has single-thread access, then sleep until we can try again. + */ + + if (tdp != NULL && tdp->td_app_ref && + ((flags & DM_FG_STHREAD) || + (tdp->td_flags & DM_TDF_STHREAD))) { + tevp->te_app_slp++; + sv_wait(&tevp->te_app_queue, 1, + &tevp->te_lock, *lcp); + *lcp = mutex_spinlock(&tevp->te_lock); + tevp->te_app_slp--; + continue; + } + + if (tdp != NULL && + (tdp->td_vcount > 0 || tdp->td_flags & DM_TDF_EVTREF)) { + /* We have an existing tdp with a non-zero vnode + reference count. If it's the wrong type, return + an appropriate errno. + */ + + if (!(tdp->td_type & types)) { + mutex_spinunlock(&tevp->te_lock, *lcp); + dm_put_tevp(tevp, NULL); /* no destroy events */ + return(EOPNOTSUPP); + } + + /* If the current access right isn't high enough, + complain. + */ + + if (tdp->td_right < right) { + mutex_spinunlock(&tevp->te_lock, *lcp); + dm_put_tevp(tevp, NULL); /* no destroy events */ + return(EACCES); + } + + /* The handle is acceptable. Increment the tdp + application and vnode references and mark the tdp + as single-threaded if necessary. + */ + + tdp->td_app_ref++; + if (flags & DM_FG_STHREAD) + tdp->td_flags |= DM_TDF_STHREAD; + tdp->td_vcount++; + + fsys_vector = dm_fsys_vector(tdp->td_vp); + (void)fsys_vector->obj_ref_hold(tdp->td_vp); + + mutex_spinunlock(&tevp->te_lock, *lcp); + *tdpp = tdp; + return(0); + } + + /* If the tdp is not in the tevp or does not have a vnode + reference, check to make sure it is okay to add/update it. + */ + + if (flags & DM_FG_MUSTEXIST) { + mutex_spinunlock(&tevp->te_lock, *lcp); + dm_put_tevp(tevp, NULL); /* no destroy events */ + return(EACCES); /* i.e. 
an insufficient right */ + } + if (flags & DM_FG_DONTADD) { + tevp->te_app_ref--; + mutex_spinunlock(&tevp->te_lock, *lcp); + return(0); + } + + /* If a tdp structure doesn't yet exist, create one and link + it into the tevp. Drop the lock while we are doing this as + zallocs can go to sleep. Once we have the memory, make + sure that another thread didn't simultaneously add the same + handle to the same event. If so, toss ours and start over. + */ + + if (tdp == NULL) { + dm_tokdata_t *tmp; + + mutex_spinunlock(&tevp->te_lock, *lcp); + + tdp = kmem_cache_alloc(dm_tokdata_cachep, SLAB_KERNEL); + if (tdp == NULL){ + printk("%s/%d: kmem_cache_alloc(dm_tokdata_cachep) returned NULL\n", __FUNCTION__, __LINE__); + return(ENOMEM); + } + memset(tdp, 0, sizeof(*tdp)); + + *lcp = mutex_spinlock(&tevp->te_lock); + + for (tmp = tevp->te_tdp; tmp; tmp = tmp->td_next) { + if (XFS_HANDLE_CMP(&tmp->td_handle, handlep) == 0) + break; + } + if (tmp) { + kmem_cache_free(dm_tokdata_cachep, tdp); + continue; + } + + tdp->td_next = tevp->te_tdp; + tevp->te_tdp = tdp; + tdp->td_tevp = tevp; + tdp->td_handle = *handlep; + } + + /* Temporarily single-thread access to the tdp so that other + threads don't touch it while we are filling the rest of the + fields in. + */ + + tdp->td_app_ref = 1; + tdp->td_flags |= DM_TDF_STHREAD; + + /* Drop the spinlock while we access, validate, and obtain the + proper rights to the object. This can take a very long time + if the vnode is not in memory, if the filesystem is + unmounting, or if the request_right() call should block + because some other tdp or kernel thread is holding a right. + */ + + mutex_spinunlock(&tevp->te_lock, *lcp); + + if ((vp = dm_handle_to_vp(handlep, &tdp->td_type)) == NULL) { + error = EBADF; + } else { + tdp->td_vcount = 1; + tdp->td_vp = vp; + + /* The handle is usable. Check that the type of the + object matches one of the types that the caller + will accept. 
+ */ + + if (!(types & tdp->td_type)) { + error = EOPNOTSUPP; + } else if (right > DM_RIGHT_NULL) { + /* Attempt to get the rights required by the + caller. If rights can't be obtained, return + an error. + */ + + VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp)); + fsys_vector = dm_fsys_vector(tdp->td_vp); + error = fsys_vector->request_right(tdp->td_vp, + DM_RIGHT_NULL, + (tdp->td_type == DM_TDT_VFS ? + DM_FSYS_OBJ : 0), + DM_RR_WAIT, right); + VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp)); + if (!error) { + tdp->td_right = right; + } + } else { + error = 0; + } + } + if (error != 0) { + dm_put_tevp(tevp, tdp); /* destroy event risk, although tiny */ + return(error); + } + + *lcp = mutex_spinlock(&tevp->te_lock); + + /* Wake up any threads which may have seen our tdp while we + were filling it in. + */ + + if (!(flags & DM_FG_STHREAD)) { + tdp->td_flags &= ~DM_TDF_STHREAD; + if (tevp->te_app_slp) + sv_broadcast(&tevp->te_app_queue); + } + + mutex_spinunlock(&tevp->te_lock, *lcp); + *tdpp = tdp; + return(0); + } +} + + +/* dm_app_get_tdp_by_token() is called whenever the application request + contains a session ID and contains a token other than DM_NO_TOKEN. + Most of the callers provide a right that is either DM_RIGHT_SHARED or + DM_RIGHT_EXCL, but a few of the callers such as dm_obj_ref_hold() may + specify a right of DM_RIGHT_NULL. +*/ + +static int +dm_app_get_tdp_by_token( + dm_sessid_t sid, /* an existing session ID */ + void *hanp, + size_t hlen, + dm_token_t token, /* an existing token */ + short types, /* acceptable object types */ + dm_right_t right, /* minimum right the object must have */ + u_int flags, + dm_tokdata_t **tdpp) +{ + dm_tokevent_t *tevp; + xfs_handle_t handle; + int error; + unsigned long lc; /* lock cookie */ + + if (right < DM_RIGHT_NULL || right > DM_RIGHT_EXCL) + return(EINVAL); + + if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0) + return(error); + + /* Find and lock the event which corresponds to the specified + session/token pair. 
+ */ + + if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0) + return(error); + + return(dm_app_lookup_tdp(&handle, tevp, &lc, types, + right, flags, tdpp)); +} + + +/* Function dm_app_get_tdp() must ONLY be called from routines associated with + application calls, e.g. dm_read_invis, dm_set_disp, etc. It must not be + called by a thread responsible for generating an event such as + dm_send_data_event()! + + dm_app_get_tdp() is the interface used by all application calls other than + dm_get_events, dm_respond_event, dm_get_config, dm_get_config_events, and by + the dm_obj_ref_* and dm_*_right families of requests. + + dm_app_get_tdp() converts a sid/hanp/hlen/token quad into a tdp pointer, + increments the number of active application threads in the event, and + increments the number of active application threads using the tdp. The + 'right' parameter must be either DM_RIGHT_SHARED or DM_RIGHT_EXCL. The + token may either be DM_NO_TOKEN, or can be a token received in a synchronous + event. +*/ + +int +dm_app_get_tdp( + dm_sessid_t sid, + void *hanp, + size_t hlen, + dm_token_t token, + short types, + dm_right_t right, /* minimum right */ + dm_tokdata_t **tdpp) +{ + dm_session_t *s; + xfs_handle_t handle; + dm_tokevent_t *tevp; + int error; + unsigned long lc; /* lock cookie */ + + ASSERT(right >= DM_RIGHT_SHARED); + + /* If a token other than DM_NO_TOKEN is specified, find the event on + this session which owns the token and increment its reference count. + */ + + if (token != DM_NO_TOKEN) { /* look up existing tokevent struct */ + return(dm_app_get_tdp_by_token(sid, hanp, hlen, token, types, + right, DM_FG_MUSTEXIST, tdpp)); + } + + /* The token is DM_NO_TOKEN. In this case we only want to verify that + the session ID is valid, and do not need to continue holding the + session lock after we know that to be true. 
+ */ + + if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0) + return(error); + + if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0) + return(error); + mutex_spinunlock(&s->sn_qlock, lc); + + /* When DM_NO_TOKEN is used, we simply block until we can obtain the + right that we want (since the tevp contains no tdp structures). + The blocking when we eventually support it will occur within + fsys_vector->request_right(). + */ + + tevp = dm_init_tevp(0, 0); + lc = mutex_spinlock(&tevp->te_lock); + + return(dm_app_lookup_tdp(&handle, tevp, &lc, types, right, 0, tdpp)); +} + + +/* dm_get_config_tdp() is only called by dm_get_config() and + dm_get_config_events(), which neither have a session ID nor a token. + Both of these calls are supposed to work even if the filesystem is in the + process of being mounted, as long as the caller only uses handles within + the mount event. +*/ + +int +dm_get_config_tdp( + void *hanp, + size_t hlen, + dm_tokdata_t **tdpp) +{ + xfs_handle_t handle; + dm_tokevent_t *tevp; + int error; + unsigned long lc; /* lock cookie */ + + if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0) + return(error); + + tevp = dm_init_tevp(0, 0); + lc = mutex_spinlock(&tevp->te_lock); + + /* Try to use the handle provided by the caller and assume DM_NO_TOKEN. + This will fail if the filesystem is in the process of being mounted. + */ + + error = dm_app_lookup_tdp(&handle, tevp, &lc, DM_TDT_ANY, + DM_RIGHT_NULL, 0, tdpp); + + if (!error) { + return(0); + } + + /* Perhaps the filesystem is still mounting, in which case we need to + see if this is one of the handles in the DM_EVENT_MOUNT tevp. + */ + + if ((tevp = dm_find_mount_tevp_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL) + return(EBADF); + + return(dm_app_lookup_tdp(&handle, tevp, &lc, DM_TDT_ANY, + DM_RIGHT_NULL, DM_FG_MUSTEXIST, tdpp)); +} + + +/* dm_put_tdp() is called to release any right held on the vnode, and to + VN_RELE() all references held on the vnode. 
It is the caller's + responsibility to ensure that no other application threads are using the + tdp, and if necessary to unlink the tdp from the tevp before calling + this routine and to free the tdp afterwards. +*/ + +static void +dm_put_tdp( + dm_tokdata_t *tdp) +{ + ASSERT(tdp->td_app_ref <= 1); + + /* If the application thread is holding a right, or if the event + thread had a right but it has disappeared because of a dm_pending + or Cntl-C, then we need to release it here. + */ + + if (tdp->td_right != DM_RIGHT_NULL) { + dm_fsys_vector_t *fsys_vector; + + VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp)); + fsys_vector = dm_fsys_vector(tdp->td_vp); + (void)fsys_vector->release_right(tdp->td_vp, tdp->td_right, + (tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0)); + VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp)); + tdp->td_right = DM_RIGHT_NULL; + } + + /* Given that we wouldn't be here if there was still an event thread, + this VN_RELE loop has the potential of generating a DM_EVENT_DESTROY + event if some other thread has unlinked the file. + */ + + while (tdp->td_vcount > 0) { + iput(LINVFS_GET_IP(tdp->td_vp)); + tdp->td_vcount--; + } + + tdp->td_flags &= ~(DM_TDF_HOLD|DM_TDF_RIGHT); + tdp->td_vp = NULL; +} + + +/* Function dm_put_tevp() must ONLY be called from routines associated with + application threads, e.g. dm_read_invis, dm_get_events, etc. It must not be + called by a thread responsible for generating an event, such as + dm_send_data_event. + + PLEASE NOTE: It is possible for this routine to generate DM_EVENT_DESTROY + events, because its calls to dm_put_tdp drop vnode references, and another + thread may have already unlinked a file whose vnode we are de-referencing. + This sets the stage for various types of deadlock if the thread calling + dm_put_tevp is the same thread that calls dm_respond_event! In particular, + the dm_sent_destroy_event routine needs to obtain the dm_reg_lock, + dm_session_lock, and sn_qlock in order to queue the destroy event. 
No
+   caller of dm_put_tevp can hold any of these locks!
+
+   Other possible deadlocks are that dm_send_destroy_event could block waiting
+   for a thread to register for the event using dm_set_disp() and/or
+   dm_set_return_on_destroy, or it could block because the session's sn_newq
+   is at the dm_max_queued_msgs event limit.  The only safe solution
+   (unimplemented) is to have a separate kernel thread for each filesystem
+   whose only job is to do the vnode-dereferencing.  That way dm_respond_event
+   will not block, so the application can keep calling dm_get_events to read
+   events even if the filesystem thread should block.  (If the filesystem
+   thread blocks, so will all subsequent destroy events for the same
+   filesystem.)
+
+   Parameters:
+	tevp	the token/event whose application reference (te_app_ref)
+		is being given up.  te_lock is taken and released here, so
+		the caller must not hold it.
+	tdp	the tdp the caller was using within tevp, or NULL if the
+		caller only holds a reference on the tevp itself.
+
+   When this turns out to be the last reference of any kind, every tdp
+   still chained on the tevp is released and freed, and then the tevp
+   itself is destroyed and freed.
+*/
+
+void
+dm_put_tevp(
+	dm_tokevent_t	*tevp,
+	dm_tokdata_t	*tdp)
+{
+	int		free_tdp = 0;	/* set when tdp was unlinked and must be freed */
+	unsigned long	lc;		/* lock cookie */
+
+	lc = mutex_spinlock(&tevp->te_lock);
+
+	if (tdp != NULL) {
+		if (tdp->td_vcount > 1 || (tdp->td_flags & DM_TDF_EVTREF)) {
+			/* Not the last vnode reference: just drop ours. */
+			ASSERT(tdp->td_app_ref > 0);
+
+			iput(LINVFS_GET_IP(tdp->td_vp));
+			tdp->td_vcount--;
+		} else {
+			ASSERT(tdp->td_app_ref == 1);
+
+			/* The vnode reference count is either already at
+			   zero (e.g. a failed dm_handle_to_vp() call in
+			   dm_app_lookup_tdp()) or is going to zero.  We can't
+			   hold the lock while we decrement the count because
+			   we could potentially end up being busy for a long
+			   time in VOP_INACTIVATE.  Use single-threading to
+			   lock others out while we clean house.
+			*/
+
+			tdp->td_flags |= DM_TDF_STHREAD;
+
+			/* WARNING - A destroy event is possible here if we are
+			   giving up the last reference on a vnode which has
+			   been previously unlinked by some other thread!
+			*/
+
+			mutex_spinunlock(&tevp->te_lock, lc);
+			dm_put_tdp(tdp);
+			lc = mutex_spinlock(&tevp->te_lock);
+
+			/* If this tdp is not one of the original tdps in the
+			   event, then remove it from the tevp.
+			*/
+
+			if (!(tdp->td_flags & DM_TDF_ORIG)) {
+				dm_tokdata_t	**tdpp = &tevp->te_tdp;
+
+				/* Walk the link pointers until the one that
+				   points at tdp is found.
+				*/
+				while (*tdpp && *tdpp != tdp) {
+					tdpp = &(*tdpp)->td_next;
+				}
+				if (*tdpp == NULL) {
+					panic("dm_remove_tdp_from_tevp: tdp "
+						"%p not in tevp %p\n", tdp,
+						tevp);
+				}
+				*tdpp = tdp->td_next;
+				free_tdp++;
+			}
+		}
+
+		/* If this is the last app thread actively using the tdp, clear
+		   any single-threading and wake up any other app threads who
+		   might be waiting to use this tdp, single-threaded or
+		   otherwise.
+		*/
+
+		if (--tdp->td_app_ref == 0) {
+			if (tdp->td_flags & DM_TDF_STHREAD) {
+				tdp->td_flags &= ~DM_TDF_STHREAD;
+				if (tevp->te_app_slp)
+					sv_broadcast(&tevp->te_app_queue);
+			}
+		}
+
+		if (free_tdp) {
+			kmem_cache_free(dm_tokdata_cachep, tdp);
+		}
+	}
+
+	/* If other application threads are using this token/event, they will
+	   do the cleanup.
+	*/
+
+	if (--tevp->te_app_ref > 0) {
+		mutex_spinunlock(&tevp->te_lock, lc);
+		return;
+	}
+
+	/* If event generation threads are waiting for this thread to go away,
+	   wake them up and let them do the cleanup.
+	*/
+
+	if (tevp->te_evt_ref > 0) {
+		sv_broadcast(&tevp->te_evt_queue);
+		mutex_spinunlock(&tevp->te_lock, lc);
+		return;
+	}
+
+	/* This thread is the last active thread using the token/event.  No
+	   lock can be held while we disassemble the tevp because we could
+	   potentially end up being busy for a long time in VOP_INACTIVATE.
+	*/
+
+	mutex_spinunlock(&tevp->te_lock, lc);
+
+	/* WARNING - One or more destroy events are possible here if we are
+	   giving up references on vnodes which have been previously unlinked
+	   by other kernel threads!
+	*/
+
+	while ((tdp = tevp->te_tdp) != NULL) {
+		tevp->te_tdp = tdp->td_next;	/* unchain before releasing */
+		dm_put_tdp(tdp);
+		kmem_cache_free(dm_tokdata_cachep, tdp);
+	}
+	spinlock_destroy(&tevp->te_lock);
+	sv_destroy(&tevp->te_evt_queue);
+	sv_destroy(&tevp->te_app_queue);
+	kfree(tevp);
+}
+
+
+/* No caller of dm_app_put_tdp can hold either of the locks dm_reg_lock,
+   dm_session_lock, or any sn_qlock!
(See dm_put_tevp for details.)
+
+   Convenience wrapper: gives up the caller's use of tdp by calling
+   dm_put_tevp() on the tevp recorded in tdp->td_tevp.
+*/
+
+void
+dm_app_put_tdp(
+	dm_tokdata_t	*tdp)
+{
+	dm_put_tevp(tdp->td_tevp, tdp);
+}
+
+
+/* dm_change_right is only called if the event thread is the one doing the
+   cleanup on a completed event.  It looks at the current rights of a tdp
+   and compares that with the rights it had on the tdp when the event was
+   created.  If different, it reacquires the original rights, then transfers
+   the rights back to being thread-based.
+
+   Without HAVE_DMAPI_RIGHTS defined, the only effect is to set
+   tdp->td_right to DM_RIGHT_NULL when the tdp has DM_TDF_EVTREF set and
+   its original right was not DM_RIGHT_NULL.
+*/
+
+static void
+dm_change_right(
+	dm_tokdata_t	*tdp)
+{
+#ifdef HAVE_DMAPI_RIGHTS
+	dm_fsys_vector_t	*fsys_vector;
+	int		error;
+	u_int		type;
+#endif
+
+	/* If the event doesn't have a vnode reference, if the original right
+	   was DM_RIGHT_NULL, or if the rights were never switched from being
+	   thread-based to tdp-based, then there is nothing to do.
+	*/
+
+	if (!(tdp->td_flags & DM_TDF_EVTREF))
+		return;
+
+	if (tdp->td_orig_right == DM_RIGHT_NULL)
+		return;
+
+	/* DEBUG - Need a check here for event-based rights. */
+
+#ifdef HAVE_DMAPI_RIGHTS
+	/* The "rights" vectors are stubs now anyway.  When they are
+	 * implemented then bhv locking will have to be sorted out.
+	 */
+
+	/* If the current right is not the same as it was when the event was
+	   created, first get back the original right.
+	*/
+
+	if (tdp->td_right != tdp->td_orig_right) {
+		fsys_vector = dm_fsys_vector(tdp->td_vp);
+		type = (tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0);
+
+		switch (tdp->td_orig_right) {
+		case DM_RIGHT_SHARED:
+			/* Try a direct downgrade first; if that fails, drop
+			   the right and request the original one afresh.
+			*/
+			if (tdp->td_right == DM_RIGHT_EXCL) {
+				error = fsys_vector->downgrade_right(
+					tdp->td_vp, tdp->td_right, type);
+				if (!error)
+					break;
+				(void)fsys_vector->release_right(tdp->td_vp,
+					tdp->td_right, type);
+			}
+			(void)fsys_vector->request_right(tdp->td_vp,
+				tdp->td_right, type, DM_RR_WAIT,
+				tdp->td_orig_right);
+			break;
+
+		case DM_RIGHT_EXCL:
+			/* Mirror image of the case above, using an upgrade. */
+			if (tdp->td_right == DM_RIGHT_SHARED) {
+				error = fsys_vector->upgrade_right(tdp->td_vp,
+					tdp->td_right, type);
+				if (!error)
+					break;
+				(void)fsys_vector->release_right(tdp->td_vp,
+					tdp->td_right, type);
+			}
+			(void)fsys_vector->request_right(tdp->td_vp,
+				tdp->td_right, type, DM_RR_WAIT,
+				tdp->td_orig_right);
+			break;
+		case DM_RIGHT_NULL:
+			break;	/* excluded by the early return above */
+		}
+	}
+#endif
+
+	/* We now have back the same level of rights as we had when the event
+	   was generated.  Now transfer the rights from being tdp-based back
+	   to thread-based.
+	*/
+
+	/* DEBUG - Add a call here to transfer rights back to thread-based. */
+
+	/* Finally, update the tdp so that we don't mess with the rights when
+	   we eventually call dm_put_tdp.
+	*/
+
+	tdp->td_right = DM_RIGHT_NULL;
+}
+
+
+/* This routine is only called by event threads.  The calls to dm_put_tdp
+   are not a deadlock risk here because this is an event thread, and it is
+   okay for such a thread to block on an induced destroy event.  Okay, maybe
+   there is a slight risk; say that the event contains three vnodes all of
+   which have DM_RIGHT_EXCL, and say that we are at the dm_max_queued_msgs
+   limit, and that the first vnode is already unlinked.  In that case the
+   destroy event will block waiting to be queued, and the application thread
+   could happen to reference one of the other locked vnodes.  Deadlock.
+
+   Parameters:
+	tevp		the token/event the event thread is finished with.
+	droprights	when non-zero, the original rights are not restored
+			through dm_change_right() before the tdps are
+			released.
+*/
+
+void
+dm_evt_rele_tevp(
+	dm_tokevent_t	*tevp,
+	int		droprights)	/* non-zero, evt thread loses rights */
+{
+	dm_tokdata_t	*tdp;
+	unsigned long	lc;		/* lock cookie */
+
+	lc = mutex_spinlock(&tevp->te_lock);
+
+	/* If we are here without DM_TEF_FINAL set and with at least one
+	   application reference still remaining, then one of several
+	   possibilities is true:
+	   1. This is an asynchronous event which has been queued but has not
+	      yet been delivered, or which is in the process of being delivered.
+	   2. This is an unmount event (pseudo-asynchronous) yet to be
+	      delivered or in the process of being delivered.
+	   3. This event had DM_FLAGS_NDELAY specified, and the application
+	      has sent a dm_pending() reply for the event.
+	   4. This is a DM_EVENT_READ, DM_EVENT_WRITE, or DM_EVENT_TRUNCATE
+	      event and the user typed a Cntl-C.
+	   In all of these cases, the correct behavior is to leave the
+	   responsibility of releasing any rights to the application threads
+	   when they are done.
+	*/
+
+	if (tevp->te_app_ref > 0 && !(tevp->te_flags & DM_TEF_FINAL)) {
+		tevp->te_evt_ref--;
+		for (tdp = tevp->te_tdp; tdp; tdp = tdp->td_next) {
+			if (tdp->td_flags & DM_TDF_EVTREF) {
+				tdp->td_flags &= ~DM_TDF_EVTREF;
+				if (tdp->td_vcount == 0) {
+					tdp->td_vp = NULL;
+				}
+			}
+		}
+		mutex_spinunlock(&tevp->te_lock, lc);
+		return;		/* not the last thread */
+	}
+
+	/* If the application reference count is non-zero here, that can only
+	   mean that dm_respond_event() has been called, but the application
+	   still has one or more threads in the kernel that haven't let go of
+	   the tevp.  In these cases, the event thread must wait until all
+	   application threads have given up their references, and their
+	   rights to handles within the event.
+	*/
+
+	while (tevp->te_app_ref) {
+		sv_wait(&tevp->te_evt_queue, 1, &tevp->te_lock, lc);
+		lc = mutex_spinlock(&tevp->te_lock);	/* te_lock is retaken after each wait */
+	}
+
+	/* This thread is the last active thread using the token/event.  Reset
+	   the rights of any vnode that was part of the original event back
+	   to their initial values before returning to the filesystem.  The
+	   exception is if the event failed (droprights is non-zero), in which
+	   case we chose to return to the filesystem with all rights released.
+	   Release the rights on any vnode that was not part of the original
+	   event.  Give up all remaining application vnode references
+	   regardless of whether or not the vnode was part of the original
+	   event.
+	*/
+
+	mutex_spinunlock(&tevp->te_lock, lc);
+
+	while ((tdp = tevp->te_tdp) != NULL) {
+		tevp->te_tdp = tdp->td_next;
+		if ((tdp->td_flags & DM_TDF_ORIG) &&
+		    (tdp->td_flags & DM_TDF_EVTREF) &&
+		    (!droprights)) {
+			dm_change_right(tdp);
+		}
+		dm_put_tdp(tdp);
+		kmem_cache_free(dm_tokdata_cachep, tdp);
+	}
+	spinlock_destroy(&tevp->te_lock);
+	sv_destroy(&tevp->te_evt_queue);
+	sv_destroy(&tevp->te_app_queue);
+	kfree(tevp);
+}
+
+
+/* dm_obj_ref_hold() is just a fancy way to get a vnode reference on an object
+   to hold it in kernel memory.
+
+   Returns 0 on success, EBUSY if the tdp already has DM_TDF_HOLD set, or
+   the error from dm_app_get_tdp_by_token().
+*/
+
+int
+dm_obj_ref_hold(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_NULL, DM_FG_STHREAD, &tdp);
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0) {
+		if (tdp->td_flags & DM_TDF_HOLD) {	/* if already held */
+			error = EBUSY;
+		} else {
+			tdp->td_flags |= DM_TDF_HOLD;
+			tdp->td_vcount++;
+
+			fsys_vector = dm_fsys_vector(tdp->td_vp);
+			(void)fsys_vector->obj_ref_hold(tdp->td_vp);
+		}
+		dm_app_put_tdp(tdp);
+	}
+	return(error);
+}
+
+/* dm_obj_ref_rele() undoes dm_obj_ref_hold(): it clears DM_TDF_HOLD and
+   drops the matching vnode reference.  Returns EACCES if the tdp is not
+   currently held, or the error from dm_app_get_tdp_by_token().
+*/
+
+int
+dm_obj_ref_rele(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen)
+{
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_NULL, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0) {
+		if (!(tdp->td_flags & DM_TDF_HOLD)) {	/* if not held */
+			error = EACCES; /* use the DM_FG_MUSTEXIST errno */
+		} else {
+			tdp->td_flags &= ~DM_TDF_HOLD;
+			iput(LINVFS_GET_IP(tdp->td_vp));
+			tdp->td_vcount--;
+		}
+		dm_app_put_tdp(tdp);
+	}
+	return(error);
+}
+
+/* dm_obj_ref_query_rvp() stores 1 in *rvp if the handle is present in the
+   event with DM_TDF_HOLD set, and 0 otherwise.  Returns 0, or the error
+   from dm_app_get_tdp_by_token().
+*/
+
+int
+dm_obj_ref_query_rvp(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	int		*rvp)
+{
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_NULL, DM_FG_DONTADD|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* If the request is valid but the handle just isn't present in the
+	   event or the hold flag isn't set, return zero, else return one.
+	*/
+
+	if (tdp) {
+		if (tdp->td_flags & DM_TDF_HOLD) {	/* if held */
+			*rvp = 1;
+		} else {
+			*rvp = 0;
+		}
+		dm_app_put_tdp(tdp);
+	} else {
+		*rvp = 0;
+	}
+	return(0);
+}
+
+/* dm_downgrade_right() asks the filesystem to downgrade the right held on
+   the object, and records DM_RIGHT_SHARED in the tdp on success.  The
+   lookup is made with DM_RIGHT_EXCL and DM_FG_MUSTEXIST.
+*/
+
+int
+dm_downgrade_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_EXCL, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* Attempt the downgrade.  Filesystems which support rights but not
+	   the downgrading of rights will return ENOSYS.
+	*/
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->downgrade_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0));
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0)
+		tdp->td_right = DM_RIGHT_SHARED;
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+/* dm_query_right() copies the right currently recorded for the handle out
+   to the user at rightp.  Returns EFAULT if that copy fails, or the error
+   from dm_app_get_tdp_by_token().
+*/
+
+int
+dm_query_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_right_t	*rightp)
+{
+	dm_tokdata_t	*tdp;
+	dm_right_t	right;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_NULL, DM_FG_DONTADD|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* Get the current right and copy it to the caller.  The tdp is
+	   single-threaded, so no mutex lock is needed.  If the tdp is not in
+	   the event we are supposed to return DM_RIGHT_NULL in order to be
+	   compatible with Veritas.
+	*/
+
+	if (tdp) {
+		right = tdp->td_right;
+		dm_app_put_tdp(tdp);
+	} else {
+		right = DM_RIGHT_NULL;
+	}
+	if (copy_to_user(rightp, &right, sizeof(right)))
+		return(EFAULT);
+	return(0);
+}
+
+/* dm_release_right() asks the filesystem to release the right held on the
+   object.  On success the tdp's right becomes DM_RIGHT_NULL, and if
+   DM_TDF_RIGHT was set that flag is cleared and one vnode reference is
+   dropped.
+*/
+
+int
+dm_release_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_SHARED, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->release_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0));
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0) {
+		tdp->td_right = DM_RIGHT_NULL;
+		if (tdp->td_flags & DM_TDF_RIGHT) {
+			tdp->td_flags &= ~DM_TDF_RIGHT;
+			iput(LINVFS_GET_IP(tdp->td_vp));
+			tdp->td_vcount--;
+		}
+	}
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+/* dm_request_right() asks the filesystem for 'right' on the object, passing
+   'flags' through to the filesystem's request_right vector.  On success
+   the new right is recorded in the tdp.
+*/
+
+int
+dm_request_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		flags,
+	dm_right_t	right)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_NULL, DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->request_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0), flags, right);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock is needed for update.
+
+	   If this is the first dm_request_right call for this vnode, then we
+	   need to bump the vnode reference count for two reasons.  First of
+	   all, it is supposed to be impossible for the file to disappear or
+	   for the filesystem to be unmounted while a right is held on a file;
+	   bumping the file's vnode reference count ensures this.  Second, if
+	   rights are ever actually implemented, it will most likely be done
+	   without changes to the on-disk inode, which means that we can't let
+	   the vnode become unreferenced while a right on it is held.
+	*/
+
+	if (error == 0) {
+		if (!(tdp->td_flags & DM_TDF_RIGHT)) {	/* if first call */
+			tdp->td_flags |= DM_TDF_RIGHT;
+			tdp->td_vcount++;
+			(void)fsys_vector->obj_ref_hold(tdp->td_vp);
+		}
+		tdp->td_right = right;
+	}
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+/* dm_upgrade_right() asks the filesystem to upgrade the right held on the
+   object, and records DM_RIGHT_EXCL in the tdp on success.  The lookup is
+   made with DM_RIGHT_SHARED and DM_FG_MUSTEXIST.
+*/
+
+int
+dm_upgrade_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_SHARED, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* If the object already has the DM_RIGHT_EXCL right, no need to
+	   attempt an upgrade.
+	*/
+
+	if (tdp->td_right == DM_RIGHT_EXCL) {
+		dm_app_put_tdp(tdp);
+		return(0);
+	}
+
+	/* Attempt the upgrade.  Filesystems which support rights but not
+	   the upgrading of rights will return ENOSYS.
+	*/
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->upgrade_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0));
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0)
+		tdp->td_right = DM_RIGHT_EXCL;
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_session.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_session.c
--- linux-2.4.19/fs/xfs/dmapi/dmapi_session.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_session.c Sat Aug 24 17:08:31 2002
@@ -0,0 +1,1539 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include "dmapi_private.h" + +dm_session_t *dm_sessions = NULL; /* head of session list */ +u_int dm_sessions_active = 0; /* # sessions currently active */ +dm_sessid_t dm_next_sessid = 1; /* next session ID to use */ +lock_t dm_session_lock = SPIN_LOCK_UNLOCKED;/* lock for session list */ + +dm_token_t dm_next_token = 1; /* next token ID to use */ +dm_sequence_t dm_next_sequence = 1; /* next sequence number to use */ +lock_t dm_token_lock = SPIN_LOCK_UNLOCKED;/* dm_next_token/dm_next_sequence lock */ + +int dm_max_queued_msgs = 2048; /* max # undelivered msgs/session */ + +#ifdef __sgi +int dm_hash_buckets = 1009; /* prime -- number of buckets */ + +/* XXX floating point not allowed in Linux kernel. 
*/
+#define DM_SHASH(sess,inodenum) ((sess)->sn_sesshash + \
+				 ((inodenum) % dm_hash_buckets))
+#endif
+
+/* sessions_read_pfs() formats the fields of one session as text.  'data'
+   is the dm_session_t to dump.  The text is written into 'buffer'; the
+   return value is the number of bytes available starting at *start
+   (at most 'count'), and *eof is set once the end of the text is reached.
+   NOTE(review): the session is read without taking any lock here.
+*/
+#ifdef CONFIG_PROC_FS
+static int
+sessions_read_pfs(char *buffer, char **start, off_t offset,
+		 int count, int *eof, void *data)
+{
+	int len;
+	dm_session_t *sessp = (dm_session_t*)data;
+
+#define CHKFULL if(len >= count) break;	/* leaves the while(1) below */
+#define ADDBUF(a,b) len += sprintf(buffer + len, a, b); CHKFULL;	/* NOTE(review): the check runs only after sprintf has already written */
+
+	len=0;
+	while(1){	/* executes once; exists so that CHKFULL can break out */
+		ADDBUF("sessp=0x%p\n", sessp);
+		ADDBUF("sn_next=0x%p\n", sessp->sn_next);
+		ADDBUF("sn_sessid=%d\n", sessp->sn_sessid);
+		ADDBUF("sn_flags=%x\n", sessp->sn_flags);
+		ADDBUF("sn_qlock=%c\n", '?');
+		ADDBUF("sn_readerq=%c\n", '?');
+		ADDBUF("sn_writerq=%c\n", '?');
+		ADDBUF("sn_readercnt=%u\n", sessp->sn_readercnt);
+		ADDBUF("sn_writercnt=%u\n", sessp->sn_writercnt);
+
+		ADDBUF("sn_newq.eq_head=0x%p\n", sessp->sn_newq.eq_head);
+		ADDBUF("sn_newq.eq_tail=0x%p\n", sessp->sn_newq.eq_tail);
+		ADDBUF("sn_newq.eq_count=%d\n", sessp->sn_newq.eq_count);
+
+		ADDBUF("sn_delq.eq_head=0x%p\n", sessp->sn_delq.eq_head);
+		ADDBUF("sn_delq.eq_tail=0x%p\n", sessp->sn_delq.eq_tail);
+		ADDBUF("sn_delq.eq_count=%d\n", sessp->sn_delq.eq_count);
+
+		ADDBUF("sn_evt_writerq.eq_head=0x%p\n", sessp->sn_evt_writerq.eq_head);
+		ADDBUF("sn_evt_writerq.eq_tail=0x%p\n", sessp->sn_evt_writerq.eq_tail);
+		ADDBUF("sn_evt_writerq.eq_count=%d\n", sessp->sn_evt_writerq.eq_count);
+
+		ADDBUF("sn_info=\"%s\"\n", sessp->sn_info);
+
+		break;
+	}
+
+	if (offset >= len) {	/* caller has already consumed everything */
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)	/* more remains than fits in this read */
+		return count;
+	*eof = 1;
+
+	return len;
+}
+#endif
+
+
+/* Link a session to the end of the session list.  New sessions are always
+   added at the end of the list so that dm_enqueue_mount_event() doesn't
+   miss a session.  The caller must have obtained dm_session_lock before
+   calling this routine.
+*/ + +static void +link_session( + dm_session_t *s) +{ + dm_session_t *tmp; + + if ((tmp = dm_sessions) == NULL) { + dm_sessions = s; + } else { + while (tmp->sn_next != NULL) + tmp = tmp->sn_next; + tmp->sn_next = s; + } + s->sn_next = NULL; + dm_sessions_active++; +} + + +/* Remove a session from the session list. The caller must have obtained + dm_session_lock before calling this routine. unlink_session() should only + be used in situations where the session is known to be on the dm_sessions + list; otherwise it panics. +*/ + +static void +unlink_session( + dm_session_t *s) +{ + dm_session_t *tmp; + + if (dm_sessions == s) { + dm_sessions = dm_sessions->sn_next; + } else { + for (tmp = dm_sessions; tmp; tmp = tmp->sn_next) { + if (tmp->sn_next == s) + break; + } + if (tmp == NULL) { + panic("unlink_session: corrupt DMAPI session list, " + "dm_sessions %p, session %p\n", + dm_sessions, s); + } + tmp->sn_next = s->sn_next; + } + s->sn_next = NULL; + dm_sessions_active--; +} + + +/* Link an event to the end of an event queue. The caller must have obtained + the session's sn_qlock before calling this routine. +*/ + +void +dm_link_event( + dm_tokevent_t *tevp, + dm_eventq_t *queue) +{ + if (queue->eq_tail) { + queue->eq_tail->te_next = tevp; + queue->eq_tail = tevp; + } else { + queue->eq_head = queue->eq_tail = tevp; + } + tevp->te_next = NULL; + queue->eq_count++; +} + + +/* Remove an event from an event queue. The caller must have obtained the + session's sn_qlock before calling this routine. dm_unlink_event() should + only be used in situations where the event is known to be on the queue; + otherwise it panics. 
+*/ + +void +dm_unlink_event( + dm_tokevent_t *tevp, + dm_eventq_t *queue) +{ + dm_tokevent_t *tmp; + + if (queue->eq_head == tevp) { + queue->eq_head = tevp->te_next; + if (queue->eq_head == NULL) + queue->eq_tail = NULL; + } else { + tmp = queue->eq_head; + while (tmp && tmp->te_next != tevp) + tmp = tmp->te_next; + if (tmp == NULL) { + panic("dm_unlink_event: corrupt DMAPI queue %p, " + "tevp %p\n", queue, tevp); + } + tmp->te_next = tevp->te_next; + if (tmp->te_next == NULL) + queue->eq_tail = tmp; + } + tevp->te_next = NULL; + queue->eq_count--; +} + +/* Link a regular file event to a hash bucket. The caller must have obtained + the session's sn_qlock before calling this routine. + The tokevent must be for a regular file object--DM_TDT_REG. +*/ + +#ifdef __sgi +static void +hash_event( + dm_session_t *s, + dm_tokevent_t *tevp) +{ + dm_sesshash_t *sh; + xfs_ino_t ino; + + if (s->sn_sesshash == NULL) + s->sn_sesshash = kmem_zalloc(dm_hash_buckets * sizeof(dm_sesshash_t), KM_SLEEP); + + ino = ((xfs_fid2_t*)&tevp->te_tdp->td_handle.ha_fid)->fid_ino; + sh = DM_SHASH(s, ino); + +#ifdef DM_SHASH_DEBUG + if (sh->h_next == NULL) { + s->sn_buckets_in_use++; + if (s->sn_buckets_in_use > s->sn_max_buckets_in_use) + s->sn_max_buckets_in_use++; + } + sh->maxlength++; + sh->curlength++; + sh->num_adds++; +#endif + + tevp->te_flags |= DM_TEF_HASHED; + tevp->te_hashnext = sh->h_next; + sh->h_next = tevp; +} +#endif + + +/* Remove a regular file event from a hash bucket. The caller must have + obtained the session's sn_qlock before calling this routine. + The tokevent must be for a regular file object--DM_TDT_REG. 
+*/ + +#ifdef __sgi +static void +unhash_event( + dm_session_t *s, + dm_tokevent_t *tevp) +{ + dm_sesshash_t *sh; + dm_tokevent_t *tmp; + xfs_ino_t ino; + + if (s->sn_sesshash == NULL) + return; + + ino = ((xfs_fid2_t*)&tevp->te_tdp->td_handle.ha_fid)->fid_ino; + sh = DM_SHASH(s, ino); + + if (sh->h_next == tevp) { + sh->h_next = tevp->te_hashnext; /* leap frog */ + } else { + tmp = sh->h_next; + while (tmp->te_hashnext != tevp) { + tmp = tmp->te_hashnext; + } + tmp->te_hashnext = tevp->te_hashnext; /* leap frog */ + } + tevp->te_hashnext = NULL; + tevp->te_flags &= ~DM_TEF_HASHED; + +#ifdef DM_SHASH_DEBUG + if (sh->h_next == NULL) + s->sn_buckets_in_use--; + sh->curlength--; + sh->num_dels++; +#endif +} +#endif + + +/* Determine if this is a repeat event. The caller MUST be holding + the session lock. + The tokevent must be for a regular file object--DM_TDT_REG. + Returns: + 0 == match not found + 1 == match found +*/ + +#ifdef __sgi +static int +repeated_event( + dm_session_t *s, + dm_tokevent_t *tevp) +{ + dm_sesshash_t *sh; + dm_data_event_t *d_event1; + dm_data_event_t *d_event2; + dm_tokevent_t *tevph; + xfs_ino_t ino1; + xfs_ino_t ino2; + + if ((!s->sn_newq.eq_tail) && (!s->sn_delq.eq_tail)) { + return(0); + } + if (s->sn_sesshash == NULL) { + return(0); + } + + ino1 = ((xfs_fid2_t*)&tevp->te_tdp->td_handle.ha_fid)->fid_ino; + sh = DM_SHASH(s, ino1); + + if (sh->h_next == NULL) { + /* bucket is empty, no match here */ + return(0); + } + + d_event1 = (dm_data_event_t *)((char *)&tevp->te_msg + tevp->te_msg.ev_data.vd_offset); + tevph = sh->h_next; + while (tevph) { + /* find something with the same event type and handle type */ + if ((tevph->te_msg.ev_type == tevp->te_msg.ev_type) && + (tevph->te_tdp->td_type == tevp->te_tdp->td_type)) { + + ino2 = ((xfs_fid2_t*)&tevp->te_tdp->td_handle.ha_fid)->fid_ino; + d_event2 = (dm_data_event_t *)((char *)&tevph->te_msg + tevph->te_msg.ev_data.vd_offset); + + /* If the two events are operating on the same file, + and 
the same part of that file, then we have a + match. + */ + if ((ino1 == ino2) && + (d_event2->de_offset == d_event1->de_offset) && + (d_event2->de_length == d_event1->de_length)) { + /* found a match */ +#ifdef DM_SHASH_DEBUG + sh->dup_hits++; +#endif + return(1); + } + } + tevph = tevph->te_hashnext; + } + + /* No match found */ + return(0); +} +#endif + + +/* Return a pointer to a session given its session ID, or EINVAL if no session + has the session ID (per the DMAPI spec). The caller must have obtained + dm_session_lock before calling this routine. +*/ + +static int +dm_find_session( + dm_sessid_t sid, + dm_session_t **sessionpp) +{ + dm_session_t *s; + + for (s = dm_sessions; s; s = s->sn_next) { + if (s->sn_sessid == sid) { + *sessionpp = s; + return(0); + } + } + return(EINVAL); +} + + +/* Return a pointer to a locked session given its session ID. '*lcp' is + used to obtain the session's sn_qlock. Caller is responsible for eventually + unlocking it. +*/ + +int +dm_find_session_and_lock( + dm_sessid_t sid, + dm_session_t **sessionpp, + unsigned long *lcp) /* addr of returned lock cookie */ +{ + int error; + + for (;;) { + *lcp = mutex_spinlock(&dm_session_lock); + + if ((error = dm_find_session(sid, sessionpp)) != 0) { + mutex_spinunlock(&dm_session_lock, *lcp); + return(error); + } + if (spin_trylock(&(*sessionpp)->sn_qlock)) { + nested_spinunlock(&dm_session_lock); + return(0); /* success */ + } + + /* If the second lock is not available, drop the first and + start over. This gives the CPU a chance to process any + interrupts, and also allows processes which want a sn_qlock + for a different session to proceed. + */ + + mutex_spinunlock(&dm_session_lock, *lcp); + } +} + + +/* Return a pointer to the event on the specified session's sn_delq which + contains the given token. The caller must have obtained the session's + sn_qlock before calling this routine. 
+
+   Returns 0 and stores the event in *tevpp on success, EINVAL if the
+   token is not above DM_INVALID_TOKEN, or ESRCH if no delivered event
+   carries the token.
+*/
+
+static int
+dm_find_msg(
+	dm_session_t	*s,
+	dm_token_t	token,
+	dm_tokevent_t	**tevpp)
+{
+	dm_tokevent_t	*cur;
+
+	if (token <= DM_INVALID_TOKEN)
+		return(EINVAL);
+
+	/* Scan the delivered queue for the first event with this token. */
+	cur = s->sn_delq.eq_head;
+	while (cur != NULL) {
+		if (cur->te_msg.ev_token == token) {
+			*tevpp = cur;
+			return(0);
+		}
+		cur = cur->te_next;
+	}
+	return(ESRCH);
+}
+
+
+/* Look up the tevp for a session ID/token pair on that session's sn_delq
+   and return it with its te_lock held.  The cookie stored in '*lcp' is the
+   one needed to release the te_lock later, which is the caller's job.
+   On failure the error is returned and no lock is left held.
+*/
+
+int
+dm_find_msg_and_lock(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	dm_tokevent_t	**tevpp,
+	unsigned long	*lcp)		/* address of returned lock cookie */
+{
+	dm_session_t	*sess;
+	int		rc;
+
+	rc = dm_find_session_and_lock(sid, &sess, lcp);
+	if (rc != 0)
+		return(rc);
+
+	rc = dm_find_msg(sess, token, tevpp);
+	if (rc != 0) {
+		mutex_spinunlock(&sess->sn_qlock, *lcp);
+		return(rc);
+	}
+
+	/* Take the event's lock before letting go of the session's, so the
+	   event is never left unprotected in between.
+	*/
+	nested_spinlock(&(*tevpp)->te_lock);
+	nested_spinunlock(&sess->sn_qlock);
+	return(0);
+}
+
+
+/* Create a new session, or resume an old session if one is given.
*/ + +int +dm_create_session( + dm_sessid_t old, + char *info, + dm_sessid_t *new) +{ + dm_session_t *s; + dm_sessid_t sid; + char sessinfo[DM_SESSION_INFO_LEN]; + size_t len; + int error; + unsigned long lc; /* lock cookie */ + + len = strnlen_user(info, DM_SESSION_INFO_LEN-1); + if (copy_from_user(sessinfo, info, len)) + return(EFAULT); + lc = mutex_spinlock(&dm_session_lock); + sid = dm_next_sessid++; + mutex_spinunlock(&dm_session_lock, lc); + if (copy_to_user(new, &sid, sizeof(sid))) + return(EFAULT); + + if (old == DM_NO_SESSION) { + s = kmem_cache_alloc(dm_session_cachep, SLAB_KERNEL); + if (s == NULL) { + printk("%s/%d: kmem_cache_alloc(dm_session_cachep) returned NULL\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + memset(s, 0, sizeof(*s)); + + sv_init(&s->sn_readerq, SV_DEFAULT, "dmreadq"); + sv_init(&s->sn_writerq, SV_DEFAULT, "dmwritq"); + spinlock_init(&s->sn_qlock, "sn_qlock"); + lc = mutex_spinlock(&dm_session_lock); + } else { + lc = mutex_spinlock(&dm_session_lock); + if ((error = dm_find_session(old, &s)) != 0) { + mutex_spinunlock(&dm_session_lock, lc); + return(error); + } +#ifdef CONFIG_PROC_FS + { + char buf[100]; + sprintf(buf, DMAPI_DBG_PROCFS "/sessions/0x%p", s); + remove_proc_entry(buf, NULL); + } +#endif + unlink_session(s); + } + bcopy(sessinfo, s->sn_info, len); + s->sn_info[len-1] = 0; /* if not NULL, then now 'tis */ + s->sn_sessid = sid; + link_session(s); +#ifdef CONFIG_PROC_FS + { + char buf[100]; + struct proc_dir_entry *entry; + + sprintf(buf, DMAPI_DBG_PROCFS "/sessions/0x%p", s); + entry = create_proc_read_entry(buf, 0, 0, sessions_read_pfs, s); + entry->owner = THIS_MODULE; + } +#endif + mutex_spinunlock(&dm_session_lock, lc); + return(0); +} + + +int +dm_destroy_session( + dm_sessid_t sid) +{ + dm_session_t *s; + int error; + unsigned long lc; /* lock cookie */ + + /* The dm_session_lock must be held until the session is unlinked. 
*/ + + lc = mutex_spinlock(&dm_session_lock); + + if ((error = dm_find_session(sid, &s)) != 0) { + mutex_spinunlock(&dm_session_lock, lc); + return(error); + } + nested_spinlock(&s->sn_qlock); + + /* The session exists. Check to see if it is still in use. If any + messages still exist on the sn_newq or sn_delq, or if any processes + are waiting for messages to arrive on the session, then the session + must not be destroyed. + */ + + if (s->sn_newq.eq_head || s->sn_readercnt || s->sn_delq.eq_head) { + nested_spinunlock(&s->sn_qlock); + mutex_spinunlock(&dm_session_lock, lc); + return(EBUSY); + } + +#ifdef CONFIG_PROC_FS + { + char buf[100]; + sprintf(buf, DMAPI_DBG_PROCFS "/sessions/0x%p", s); + remove_proc_entry(buf, NULL); + } +#endif + + /* The session is not in use. Dequeue it from the session chain. */ + + unlink_session(s); + nested_spinunlock(&s->sn_qlock); + mutex_spinunlock(&dm_session_lock, lc); + + /* Now clear the sessions's disposition registration, and then destroy + the session structure. + */ + + dm_clear_fsreg(s); + + spinlock_destroy(&s->sn_qlock); + sv_destroy(&s->sn_readerq); + sv_destroy(&s->sn_writerq); +#ifdef __sgi + if (s->sn_sesshash) + kmem_free(s->sn_sesshash, dm_hash_buckets * sizeof(dm_sesshash_t)); +#endif + kmem_cache_free(dm_session_cachep, s); + return(0); +} + + +/* + * Return a list of all active sessions. + */ + +int +dm_getall_sessions( + u_int nelem, + dm_sessid_t *sidp, + u_int *nelemp) +{ + dm_session_t *s; + u_int sesscnt; + dm_sessid_t *sesslist; + unsigned long lc; /* lock cookie */ + int error; + int i; + + /* Loop until we can get the right amount of temp space, being careful + not to hold a mutex during the allocation. Usually only one trip. 
+ */ + + for (;;) { + if ((sesscnt = dm_sessions_active) == 0) { + /*if (suword(nelemp, 0))*/ + if (put_user(0, nelemp)) + return(EFAULT); + return(0); + } + sesslist = kmalloc(sesscnt * sizeof(*sidp), GFP_KERNEL); + if (sesslist == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + + lc = mutex_spinlock(&dm_session_lock); + if (sesscnt == dm_sessions_active) + break; + + mutex_spinunlock(&dm_session_lock, lc); + kfree(sesslist); + } + + /* Make a temp copy of the data, then release the mutex. */ + + for (i = 0, s = dm_sessions; i < sesscnt; i++, s = s->sn_next) + sesslist[i] = s->sn_sessid; + + mutex_spinunlock(&dm_session_lock, lc); + + /* Now copy the data to the user. */ + + if(put_user(sesscnt, nelemp)) { + error = EFAULT; + } else if (sesscnt > nelem) { + error = E2BIG; + } else if (copy_to_user(sidp, sesslist, sesscnt * sizeof(*sidp))) { + error = EFAULT; + } else { + error = 0; + } + kfree(sesslist); + return(error); +} + + +/* + * Return the descriptive string associated with a session. + */ + +int +dm_query_session( + dm_sessid_t sid, + size_t buflen, + void *bufp, + size_t *rlenp) +{ + dm_session_t *s; /* pointer to session given by sid */ + int len; /* length of session info string */ + int error; + char sessinfo[DM_SESSION_INFO_LEN]; + unsigned long lc; /* lock cookie */ + + if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0) + return(error); + + len = strlen(s->sn_info) + 1; /* NULL terminated when created */ + bcopy(s->sn_info, sessinfo, len); + + mutex_spinunlock(&s->sn_qlock, lc); + + /* Now that the mutex is released, copy the sessinfo to the user. */ + + if (put_user(len, rlenp)) { + error = EFAULT; + } else if (len > buflen) { + error = E2BIG; + } else if (copy_to_user(bufp, sessinfo, len)) { + error = EFAULT; + } else { + error = 0; + } + return(error); +} + + +/* + * Return all of the previously delivered tokens (that is, their IDs) + * for the given session. 
+ */
+
+int
+dm_getall_tokens(
+	dm_sessid_t	sid,		/* session obtaining tokens from */
+	u_int		nelem,		/* size of tokenbufp */
+	dm_token_t	*tokenbufp,	/* buffer to copy token IDs to */
+	u_int		*nelemp)	/* return number copied to tokenbufp */
+{
+	dm_session_t	*sessp;		/* session looked up from sid */
+	dm_tokevent_t	*ev;		/* cursor over the delivered queue */
+	dm_token_t	*tokens;	/* kernel-side copy of the token IDs */
+	unsigned long	cookie;		/* lock cookie */
+	int		count;		/* number of delivered events */
+	int		rc;
+	int		idx;
+
+	/* Size the temporary buffer without holding the session lock across
+	   the allocation: sample the queue length, unlock, allocate, relock,
+	   and start over if the length changed in the meantime.
+	*/
+
+	while (1) {
+		rc = dm_find_session_and_lock(sid, &sessp, &cookie);
+		if (rc != 0)
+			return(rc);
+		count = sessp->sn_delq.eq_count;
+		mutex_spinunlock(&sessp->sn_qlock, cookie);
+
+		/* Nothing delivered: just report a count of zero. */
+		if (count == 0)
+			return(put_user(0, nelemp) ? EFAULT : 0);
+
+		tokens = kmalloc(count * sizeof(*tokenbufp), GFP_KERNEL);
+		if (tokens == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			return ENOMEM;
+		}
+
+		rc = dm_find_session_and_lock(sid, &sessp, &cookie);
+		if (rc != 0) {
+			kfree(tokens);
+			return(rc);
+		}
+
+		if (sessp->sn_delq.eq_count == count)
+			break;		/* buffer is the right size */
+
+		mutex_spinunlock(&sessp->sn_qlock, cookie);
+		kfree(tokens);
+	}
+
+	/* Snapshot the token IDs under the lock, then drop it. */
+
+	for (idx = 0, ev = sessp->sn_delq.eq_head; idx < count; idx++) {
+		tokens[idx] = ev->te_msg.ev_token;
+		ev = ev->te_next;
+	}
+
+	mutex_spinunlock(&sessp->sn_qlock, cookie);
+
+	/* Report the count first so the caller learns the required size even
+	   when the supplied buffer is too small.
+	*/
+
+	rc = 0;
+	if (put_user(count, nelemp))
+		rc = EFAULT;
+	else if (count > nelem)
+		rc = E2BIG;
+	else if (copy_to_user(tokenbufp, tokens, count * sizeof(*tokenbufp)))
+		rc = EFAULT;
+
+	kfree(tokens);
+	return(rc);
+}
+
+
+/*
+ *  Return the message identified by token.
+ */ + +int +dm_find_eventmsg( + dm_sessid_t sid, + dm_token_t token, + size_t buflen, + void *bufp, + size_t *rlenp) +{ + dm_tokevent_t *tevp; /* message identified by token */ + int msgsize; /* size of message to copy out */ + void *msg; + int error; + unsigned long lc; /* lock cookie */ + + /* Because some of the events (dm_data_event_t in particular) contain + __u64 fields, we need to make sure that the buffer provided by the + caller is aligned such that he can read those fields successfully. + */ + + if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0) + return(EFAULT); + + /* Allocate the right amount of temp space, being careful not to hold + a mutex during the allocation. + */ + + if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0) + return(error); + msgsize = tevp->te_allocsize - offsetof(dm_tokevent_t, te_msg); + mutex_spinunlock(&tevp->te_lock, lc); + + msg = kmalloc(msgsize, GFP_KERNEL); + if (msg == NULL) { + printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + + if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0) { + kfree(msg); + return(error); + } + + /* Make a temp copy of the data, then release the mutex. */ + + bcopy(&tevp->te_msg, msg, msgsize); + mutex_spinunlock(&tevp->te_lock, lc); + + /* Now copy the data to the user. 
*/ + + if (put_user(msgsize,rlenp)) { + error = EFAULT; + } else if (msgsize > buflen) { /* user buffer not big enough */ + error = E2BIG; + } else if (copy_to_user( bufp, msg, msgsize )) { + error = EFAULT; + } else { + error = 0; + } + kfree(msg); + return(error); +} + + +int +dm_move_event( + dm_sessid_t srcsid, + dm_token_t token, + dm_sessid_t targetsid, + dm_token_t *rtokenp) +{ + dm_session_t *s1; + dm_session_t *s2; + dm_tokevent_t *tevp; + int error; + unsigned long lc; /* lock cookie */ +#ifdef __sgi + int hash_it; +#endif + + lc = mutex_spinlock(&dm_session_lock); + + if ((error = dm_find_session(srcsid, &s1)) != 0 || + (error = dm_find_session(targetsid, &s2)) != 0 || + (error = dm_find_msg(s1, token, &tevp)) != 0) { + mutex_spinunlock(&dm_session_lock, lc); + return(error); + } + dm_unlink_event(tevp, &s1->sn_delq); +#ifdef __sgi + if (tevp->te_flags & DM_TEF_HASHED) { + unhash_event(s1, tevp); + hash_it = 1; + } +#endif + dm_link_event(tevp, &s2->sn_delq); +#ifdef __sgi + if (hash_it) + hash_event(s2, tevp); +#endif + mutex_spinunlock(&dm_session_lock, lc); + + if (copy_to_user(rtokenp, &token, sizeof(token))) + return(EFAULT); + return(0); +} + + +/* ARGSUSED */ +int +dm_pending( + dm_sessid_t sid, + dm_token_t token, + dm_timestruct_t *delay) /* unused */ +{ + dm_tokevent_t *tevp; + int error; + unsigned long lc; /* lock cookie */ + + if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0) + return(error); + + tevp->te_flags |= DM_TEF_INTERMED; + if (tevp->te_evt_ref > 0) /* if event generation threads exist */ + sv_broadcast(&tevp->te_evt_queue); + + mutex_spinunlock(&tevp->te_lock, lc); + return(0); +} + + +int +dm_get_events( + dm_sessid_t sid, + u_int maxmsgs, + u_int flags, + size_t buflen, + void *bufp, + size_t *rlenp) +{ + dm_session_t *s; /* pointer to session given by sid */ + dm_tokevent_t *tevp; /* next event message on queue */ + int error; + unsigned long lc1; /* first lock cookie */ + unsigned long lc2 = 0; /* second lock 
cookie */ + int totalsize; + int msgsize; + dm_eventmsg_t *prevmsg; + int prev_msgsize = 0; + u_int msgcnt; + + /* Because some of the events (dm_data_event_t in particular) contain + __u64 fields, we need to make sure that the buffer provided by the + caller is aligned such that he can read those fields successfully. + */ + + if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0) + return(EFAULT); + + /* Find the indicated session and lock it. */ + + if ((error = dm_find_session_and_lock(sid, &s, &lc1)) != 0) + return(error); + + /* Check for messages on sn_newq. If there aren't any that haven't + already been grabbed by another process, and if we are supposed to + to wait until one shows up, then go to sleep interruptibly on the + sn_readerq semaphore. The session can't disappear out from under + us as long as sn_readerq is non-zero. + */ + + for (;;) { + int rc; + + for (tevp = s->sn_newq.eq_head; tevp; tevp = tevp->te_next) { + lc2 = mutex_spinlock(&tevp->te_lock); + if (!(tevp->te_flags & DM_TEF_LOCKED)) + break; + mutex_spinunlock(&tevp->te_lock, lc2); + } + if (tevp) + break; /* got one! */ + + if (!(flags & DM_EV_WAIT)) { + mutex_spinunlock(&s->sn_qlock, lc1); + return(EAGAIN); + } + s->sn_readercnt++; + + sv_wait_sig(&s->sn_readerq, 1, &s->sn_qlock, lc1); + rc = signal_pending(current); + + lc1 = mutex_spinlock(&s->sn_qlock); + s->sn_readercnt--; + if (rc) { /* if signal was received */ + mutex_spinunlock(&s->sn_qlock, lc1); + return(EINTR); + } + } + + /* At least one message is available for delivery, and we have both the + session lock and event lock. Mark the event so that it is not + grabbed by other daemons, then drop both locks prior copying the + data to the caller's buffer. Leaving the event on the queue in a + marked state prevents both the session and the event from + disappearing out from under us while we don't have the locks. 
+ */ + + tevp->te_flags |= DM_TEF_LOCKED; + mutex_spinunlock(&tevp->te_lock, lc2); /* reverse cookie order */ + mutex_spinunlock(&s->sn_qlock, lc1); + + /* Continue to deliver messages until there are no more, the + user's buffer becomes full, or we hit his maxmsgs limit. + */ + + totalsize = 0; /* total bytes transferred to the user */ + prevmsg = NULL; + msgcnt = 0; + + while (tevp) { + /* Compute the number of bytes to be moved, rounding up to an + 8-byte boundary so that any subsequent messages will also be + aligned. + */ + + msgsize = tevp->te_allocsize - offsetof(dm_tokevent_t, te_msg); + msgsize = (msgsize + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1); + totalsize += msgsize; + + /* If it fits, copy the message into the user's buffer and + update his 'rlenp'. Update the _link pointer for any + previous message. + */ + + if (totalsize > buflen) { /* no more room */ + error = E2BIG; + } else if (put_user(totalsize, rlenp)) { + error = EFAULT; + } else if (copy_to_user(bufp, &tevp->te_msg, msgsize)) { + error = EFAULT; + } else if (prevmsg && put_user(prev_msgsize, &prevmsg->_link)) { + error = EFAULT; + } else { + error = 0; + } + + /* If an error occurred, just unmark the event and leave it on + the queue for someone else. Note that other daemons may + have gone to sleep because this event was marked, so wake + them up. Also, if at least one message has already been + delivered, then an error here is not really an error. + */ + + lc1 = mutex_spinlock(&s->sn_qlock); + lc2 = mutex_spinlock(&tevp->te_lock); + tevp->te_flags &= ~DM_TEF_LOCKED; /* drop the mark */ + + if (error) { + if (s->sn_readercnt) + sv_signal(&s->sn_readerq); + + mutex_spinunlock(&tevp->te_lock, lc2); /* rev. order */ + mutex_spinunlock(&s->sn_qlock, lc1); + if (prevmsg) + return(0); + if (error == E2BIG && put_user(totalsize,rlenp)) + error = EFAULT; + return(error); + } + + /* The message was successfully delivered. Unqueue it. 
*/ + + dm_unlink_event(tevp, &s->sn_newq); + + /* Wake up the first of any processes waiting for room on the + sn_newq. + */ + + if (s->sn_writercnt) + sv_signal(&s->sn_writerq); + + /* If the message is synchronous, add it to the sn_delq while + still holding the lock. If it is asynchronous, free it. + */ + + if (tevp->te_msg.ev_token != DM_INVALID_TOKEN) { /* synch */ + dm_link_event(tevp, &s->sn_delq); + mutex_spinunlock(&tevp->te_lock, lc2); + } else { + tevp->te_flags |= DM_TEF_FINAL; +#ifdef __sgi + if (tevp->te_flags & DM_TEF_HASHED) + unhash_event(s, tevp); +#endif + mutex_spinunlock(&tevp->te_lock, lc2); + dm_put_tevp(tevp, NULL);/* can't cause destroy events */ + } + + /* Update our notion of where we are in the user's buffer. If + he doesn't want any more messages, then stop. + */ + + prevmsg = (dm_eventmsg_t *)bufp; + prev_msgsize = msgsize; + bufp = (char *)bufp + msgsize; + + msgcnt++; + if (maxmsgs && msgcnt >= maxmsgs) { + mutex_spinunlock(&s->sn_qlock, lc1); + break; + } + + /* While still holding the sn_qlock, see if any additional + messages are available for delivery. + */ + + for (tevp = s->sn_newq.eq_head; tevp; tevp = tevp->te_next) { + lc2 = mutex_spinlock(&tevp->te_lock); + if (!(tevp->te_flags & DM_TEF_LOCKED)) { + tevp->te_flags |= DM_TEF_LOCKED; + mutex_spinunlock(&tevp->te_lock, lc2); + break; + } + mutex_spinunlock(&tevp->te_lock, lc2); + } + mutex_spinunlock(&s->sn_qlock, lc1); + } + return(0); +} + + +/* + * Remove an event message from the delivered queue, set the returned + * error where the event generator wants it, and wake up the generator. + * Also currently have the user side release any locks it holds... 
+ */ + +/* ARGSUSED */ +int +dm_respond_event( + dm_sessid_t sid, + dm_token_t token, + dm_response_t response, + int reterror, + size_t buflen, /* unused */ + void *respbufp) /* unused */ +{ + dm_session_t *s; /* pointer to session given by sid */ + dm_tokevent_t *tevp; /* event message queue traversal */ + int error; + unsigned long lc; /* lock cookie */ + + /* Sanity check the input parameters. */ + + switch (response) { + case DM_RESP_CONTINUE: /* continue must have reterror == 0 */ + if (reterror != 0) + return(EINVAL); + break; + case DM_RESP_ABORT: /* abort must have errno set */ + if (reterror <= 0) + return(EINVAL); + break; + case DM_RESP_DONTCARE: + if (reterror > 0) + return(EINVAL); + reterror = -1; /* to distinguish DM_RESP_DONTCARE */ + break; + default: + return(EINVAL); + } + + /* Hold session lock until the event is unqueued. */ + + if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0) + return(error); + + if ((error = dm_find_msg(s, token, &tevp)) != 0) { + mutex_spinunlock(&s->sn_qlock, lc); + return(error); + } + nested_spinlock(&tevp->te_lock); + + if (reterror == -1 && tevp->te_msg.ev_type != DM_EVENT_MOUNT) { + error = EINVAL; + nested_spinunlock(&tevp->te_lock); + mutex_spinunlock(&s->sn_qlock, lc); + } else { + dm_unlink_event(tevp, &s->sn_delq); +#ifdef __sgi + if (tevp->te_flags & DM_TEF_HASHED) + unhash_event(s, tevp); +#endif + tevp->te_reply = reterror; + tevp->te_flags |= DM_TEF_FINAL; + if (tevp->te_evt_ref) + sv_broadcast(&tevp->te_evt_queue); + nested_spinunlock(&tevp->te_lock); + mutex_spinunlock(&s->sn_qlock, lc); + error = 0; + + /* Absolutely no locks can be held when calling dm_put_tevp! */ + + dm_put_tevp(tevp, NULL); /* this can generate destroy events */ + } + return(error); +} + + +/* Queue the filled in event message pointed to by tevp on the session s, and + (if a synchronous event) wait for the reply from the DMAPI application. + The caller MUST be holding the session lock before calling this routine! 
+ The session lock is always released upon exit. + Returns: + -1 == don't care + 0 == success (or async event) + > 0 == errno describing reason for failure +*/ + +static int +dm_enqueue( + dm_session_t *s, + unsigned long lc, /* input lock cookie */ + dm_tokevent_t *tevp, /* in/out parameter */ + int sync, + int flags, + int interruptable) +{ + int is_unmount = 0; +#ifdef __sgi + int is_hashable = 0; +#endif + int reply; + +#ifdef __sgi + /* If the caller isn't planning to stick around for the result + and this request is identical to one that is already on the + queues then just give the caller an EAGAIN. Release the + session lock before returning. + + We look only at NDELAY requests with an event type of READ, + WRITE, or TRUNCATE on objects that are regular files. + */ + + if ((flags & DM_FLAGS_NDELAY) && DM_EVENT_RDWRTRUNC(tevp) && + (tevp->te_tdp->td_type == DM_TDT_REG)) { + if (repeated_event(s, tevp)) { + mutex_spinunlock(&s->sn_qlock, lc); + return(EAGAIN); + } + is_hashable = 1; + } +#endif + + if (tevp->te_msg.ev_type == DM_EVENT_UNMOUNT) + is_unmount = 1; + + /* Check for room on sn_newq. If there is no room for new messages, + then go to sleep on the sn_writerq semaphore. The + session cannot disappear out from under us as long as sn_writercnt + is non-zero. + */ + + while (s->sn_newq.eq_count >= dm_max_queued_msgs) { /* no room */ + s->sn_writercnt++; + dm_link_event(tevp, &s->sn_evt_writerq); + if (interruptable) { + sv_wait_sig(&s->sn_writerq, 1, &s->sn_qlock, lc); + if (signal_pending(current)) { + s->sn_writercnt--; + return(EINTR); + } + } else { + sv_wait(&s->sn_writerq, 1, &s->sn_qlock, lc); + } + lc = mutex_spinlock(&s->sn_qlock); + s->sn_writercnt--; + dm_unlink_event(tevp, &s->sn_evt_writerq); + } + + /* Assign a sequence number and token to the event and bump the + application reference count by one. We don't need 'te_lock' here + because this thread is still the only thread that can see the event. 
+ */ + + nested_spinlock(&dm_token_lock); + tevp->te_msg.ev_sequence = dm_next_sequence++; + if (sync) { + tevp->te_msg.ev_token = dm_next_token++; + } else { + tevp->te_msg.ev_token = DM_INVALID_TOKEN; + } + nested_spinunlock(&dm_token_lock); + + tevp->te_app_ref++; + + /* Room exists on the sn_newq queue, so add this request. If the + queue was previously empty, wake up the first of any processes + that are waiting for an event. + */ + + dm_link_event(tevp, &s->sn_newq); +#ifdef __sgi + if (is_hashable) + hash_event(s, tevp); +#endif + + if (s->sn_readercnt) + sv_signal(&s->sn_readerq); + + mutex_spinunlock(&s->sn_qlock, lc); + + /* Now that the message is queued, processes issuing asynchronous + events or DM_EVENT_UNMOUNT events are ready to continue. + */ + + if (!sync || is_unmount) + return(0); + + /* Synchronous requests wait until a final reply is received. If the + caller supplied the DM_FLAGS_NDELAY flag, the process will return + EAGAIN if dm_pending() sets DM_TEF_INTERMED. We also let users + Cntl-C out of a read, write, and truncate requests. + */ + + lc = mutex_spinlock(&tevp->te_lock); + + while (!(tevp->te_flags & DM_TEF_FINAL)) { + if ((tevp->te_flags & DM_TEF_INTERMED) && + (flags & DM_FLAGS_NDELAY)) { + mutex_spinunlock(&tevp->te_lock, lc); + return(EAGAIN); + } + if (tevp->te_msg.ev_type == DM_EVENT_READ || + tevp->te_msg.ev_type == DM_EVENT_WRITE || + tevp->te_msg.ev_type == DM_EVENT_TRUNCATE) { + sv_wait_sig(&tevp->te_evt_queue, 1, &tevp->te_lock, lc); + if (signal_pending(current)){ + return(EINTR); + } + } else { + sv_wait(&tevp->te_evt_queue, 1, &tevp->te_lock, lc); + } + lc = mutex_spinlock(&tevp->te_lock); + } + + /* Return both the tevp and the reply which was stored in the tevp by + dm_respond_event. The tevp structure has already been removed from + the reply queue by this point in dm_respond_event(). 
+ */ + + reply = tevp->te_reply; + mutex_spinunlock(&tevp->te_lock, lc); + return(reply); +} + + +/* The filesystem is guaranteed to stay mounted while this event is + outstanding. +*/ + +int +dm_enqueue_normal_event( + vfs_t *vfsp, + dm_tokevent_t *tevp, + int flags) +{ + dm_session_t *s; + int error; + int sync; + unsigned long lc; /* lock cookie */ + + switch (tevp->te_msg.ev_type) { + case DM_EVENT_READ: + case DM_EVENT_WRITE: + case DM_EVENT_TRUNCATE: + case DM_EVENT_PREUNMOUNT: + case DM_EVENT_UNMOUNT: + case DM_EVENT_NOSPACE: + case DM_EVENT_CREATE: + case DM_EVENT_REMOVE: + case DM_EVENT_RENAME: + case DM_EVENT_SYMLINK: + case DM_EVENT_LINK: + case DM_EVENT_DEBUT: /* not currently supported */ + sync = 1; + break; + + case DM_EVENT_DESTROY: + case DM_EVENT_POSTCREATE: + case DM_EVENT_POSTREMOVE: + case DM_EVENT_POSTRENAME: + case DM_EVENT_POSTSYMLINK: + case DM_EVENT_POSTLINK: + case DM_EVENT_ATTRIBUTE: + case DM_EVENT_CLOSE: /* not currently supported */ + case DM_EVENT_CANCEL: /* not currently supported */ + sync = 0; + break; + + default: + return(EIO); /* garbage event number */ + } + + /* Wait until a session selects disposition for the event. The session + is locked upon return from dm_waitfor_disp_session(). + */ + + if ((error = dm_waitfor_disp_session(vfsp, tevp, &s, &lc)) != 0) + return(error); + + return(dm_enqueue(s, lc, tevp, sync, flags, 0)); +} + + +/* Traverse the session list checking for sessions with the WANTMOUNT flag + set. When one is found, send it the message. Possible responses to the + message are one of DONTCARE, CONTINUE, or ABORT. The action taken in each + case is: + DONTCARE (-1) - Send the event to the next session with WANTMOUNT set + CONTINUE ( 0) - Proceed with the mount, errno zero. + ABORT (>0) - Fail the mount, return the returned errno. + + The mount request is sent to sessions in ascending session ID order. 
+ Since the session list can change dramatically while this process is + sleeping in dm_enqueue(), this routine must use session IDs rather than + session pointers when keeping track of where it is in the list. Since + new sessions are always added at the end of the queue, and have increasing + session ID values, we don't have to worry about missing any session. +*/ + +int +dm_enqueue_mount_event( + vfs_t *vfsp, + dm_tokevent_t *tevp) +{ + dm_session_t *s; + dm_sessid_t sid; + int error; + unsigned long lc; /* lock cookie */ + + /* Make the mounting filesystem visible to other DMAPI calls. */ + + if ((error = dm_add_fsys_entry(vfsp, tevp)) != 0){ + return(error); + } + + /* Walk through the session list presenting the mount event to each + session that is interested until a session accepts or rejects it, + or until all sessions ignore it. + */ + + for (sid = DM_NO_SESSION, error = -1; error < 0; sid = s->sn_sessid) { + + lc = mutex_spinlock(&dm_session_lock); + for (s = dm_sessions; s; s = s->sn_next) { + if (s->sn_sessid > sid && s->sn_flags & DM_SN_WANTMOUNT) { + nested_spinlock(&s->sn_qlock); + nested_spinunlock(&dm_session_lock); + break; + } + } + if (s == NULL) { + mutex_spinunlock(&dm_session_lock, lc); + break; /* noone wants it; proceed with mount */ + } + error = dm_enqueue(s, lc, tevp, 1, 0, 0); + } + + /* If the mount will be allowed to complete, then update the fsrp entry + accordingly. If the mount is to be aborted, remove the fsrp entry. 
+ */ + + if (error <= 0) { + dm_change_fsys_entry(vfsp, DM_STATE_MOUNTED); + error = 0; + } else { + dm_remove_fsys_entry(vfsp); + } + return(error); +} + +int +dm_enqueue_sendmsg_event( + dm_sessid_t targetsid, + dm_tokevent_t *tevp, + int sync) +{ + dm_session_t *s; + int error; + unsigned long lc; /* lock cookie */ + + if ((error = dm_find_session_and_lock(targetsid, &s, &lc)) != 0) + return(error); + + return(dm_enqueue(s, lc, tevp, sync, 0, 1)); +} + + +dm_token_t +dm_enqueue_user_event( + dm_sessid_t sid, + dm_tokevent_t *tevp, + dm_token_t *tokenp) +{ + dm_session_t *s; + int error; + unsigned long lc; /* lock cookie */ + + /* Atomically find and lock the session whose session id is 'sid'. */ + + if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0) + return(error); + + /* Assign a sequence number and token to the event, bump the + application reference count by one, and decrement the event + count because the caller gives up all ownership of the event. + We don't need 'te_lock' here because this thread is still the + only thread that can see the event. + */ + + nested_spinlock(&dm_token_lock); + tevp->te_msg.ev_sequence = dm_next_sequence++; + *tokenp = tevp->te_msg.ev_token = dm_next_token++; + nested_spinunlock(&dm_token_lock); + + tevp->te_flags &= ~(DM_TEF_INTERMED|DM_TEF_FINAL); + tevp->te_app_ref++; + tevp->te_evt_ref--; + + /* Add the request to the tail of the sn_delq. Now it's visible. */ + + dm_link_event(tevp, &s->sn_delq); + mutex_spinunlock(&s->sn_qlock, lc); + + return(0); +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/dmapi/dmapi_sysent.c linux-2.4-xfs/fs/xfs/dmapi/dmapi_sysent.c --- linux-2.4.19/fs/xfs/dmapi/dmapi_sysent.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/dmapi/dmapi_sysent.c Fri Aug 30 11:08:17 2002 @@ -0,0 +1,758 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* Data Migration API (DMAPI) + */ + + +/* We're using MISC_MAJOR / DMAPI_MINOR. 
*/ + +#include +#include +#include +#include +#include +#include + +#include + +#include "dmapi_private.h" + +kmem_cache_t *dm_fsreg_cachep = NULL; +kmem_cache_t *dm_tokdata_cachep = NULL; +kmem_cache_t *dm_session_cachep = NULL; + +static int +dmapi_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + sys_dmapi_args_t kargs; + sys_dmapi_args_t *uap = &kargs; + int error = 0; + int rvp = -ENOSYS; + int use_rvp = 0; + + if (!capable(CAP_MKNOD)) + return(-EPERM); + + if( copy_from_user( &kargs, (sys_dmapi_args_t*)arg, + sizeof(sys_dmapi_args_t) ) ) + return -EFAULT; + + switch (cmd) { + case DM_CLEAR_INHERIT: + error = dm_clear_inherit( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_attrname_t *) DM_Parg(uap,5));/* attrnamep */ + break; + case DM_CREATE_BY_HANDLE: + error = dm_create_by_handle( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* dirhanp */ + (size_t) DM_Uarg(uap,3), /* dirhlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (void *) DM_Parg(uap,5), /* hanp */ + (size_t) DM_Uarg(uap,6), /* hlen */ + (char *) DM_Parg(uap,7));/* cname */ + break; + case DM_CREATE_SESSION: + error = dm_create_session( + (dm_sessid_t) DM_Uarg(uap,1), /* oldsid */ + (char *) DM_Parg(uap,2), /* sessinfop */ + (dm_sessid_t *) DM_Parg(uap,3));/* newsidp */ + break; + case DM_CREATE_USEREVENT: + error = dm_create_userevent( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (size_t) DM_Uarg(uap,2), /* msglen */ + (void *) DM_Parg(uap,3), /* msgdatap */ + (dm_token_t *) DM_Parg(uap,4));/* tokenp */ + break; + case DM_DESTROY_SESSION: + error = dm_destroy_session( + (dm_sessid_t) DM_Uarg(uap,1));/* sid */ + break; + case DM_DOWNGRADE_RIGHT: + error = dm_downgrade_right( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) 
DM_Uarg(uap,4));/* token */ + break; + case DM_FD_TO_HANDLE: + error = dm_fd_to_hdl( + (int) DM_Uarg(uap,1), /* fd */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t *) DM_Parg(uap,3));/* hlenp */ + break; + case DM_FIND_EVENTMSG: + error = dm_find_eventmsg( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (dm_token_t) DM_Uarg(uap,2), /* token */ + (size_t) DM_Uarg(uap,3), /* buflen */ + (void *) DM_Parg(uap,4), /* bufp */ + (size_t *) DM_Parg(uap,5));/* rlenp */ + break; + case DM_GET_ALLOCINFO: + use_rvp = 1; + error = dm_get_allocinfo_rvp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_off_t *) DM_Parg(uap,5), /* offp */ + (u_int) DM_Uarg(uap,6), /* nelem */ + (dm_extent_t *) DM_Parg(uap,7), /* extentp */ + (u_int *) DM_Parg(uap,8), /* nelemp */ + &rvp); + break; + case DM_GET_BULKALL: + use_rvp = 1; + error = dm_get_bulkall_rvp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* mask */ + (dm_attrname_t *) DM_Parg(uap,6),/* attrnamep */ + (dm_attrloc_t *) DM_Parg(uap,7),/* locp */ + (size_t) DM_Uarg(uap,8), /* buflen */ + (void *) DM_Parg(uap,9), /* bufp */ + (size_t *) DM_Parg(uap,10),/* rlenp */ + &rvp); + break; + case DM_GET_BULKATTR: + use_rvp = 1; + error = dm_get_bulkattr_rvp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* mask */ + (dm_attrloc_t *)DM_Parg(uap,6), /* locp */ + (size_t) DM_Uarg(uap,7), /* buflen */ + (void *) DM_Parg(uap,8), /* bufp */ + (size_t *) DM_Parg(uap,9), /* rlenp */ + &rvp); + break; + case DM_GET_CONFIG: + error = dm_get_config( + (void *) DM_Parg(uap,1), /* hanp */ + (size_t) DM_Uarg(uap,2), /* hlen */ + (dm_config_t) DM_Uarg(uap,3), /* 
flagname */ + (dm_size_t *) DM_Parg(uap,4));/* retvalp */ + break; + case DM_GET_CONFIG_EVENTS: + error = dm_get_config_events( + (void *) DM_Parg(uap,1), /* hanp */ + (size_t) DM_Uarg(uap,2), /* hlen */ + (u_int) DM_Uarg(uap,3), /* nelem */ + (dm_eventset_t *) DM_Parg(uap,4),/* eventsetp */ + (u_int *) DM_Parg(uap,5));/* nelemp */ + break; + case DM_GET_DIOINFO: + error = dm_get_dioinfo( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_dioinfo_t *)DM_Parg(uap,5));/* diop */ + break; + case DM_GET_DIRATTRS: + use_rvp = 1; + error = dm_get_dirattrs_rvp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* mask */ + (dm_attrloc_t *)DM_Parg(uap,6), /* locp */ + (size_t) DM_Uarg(uap,7), /* buflen */ + (void *) DM_Parg(uap,8), /* bufp */ + (size_t *) DM_Parg(uap,9), /* rlenp */ + &rvp); + break; + case DM_GET_DMATTR: + error = dm_get_dmattr( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_attrname_t *) DM_Parg(uap,5),/* attrnamep */ + (size_t) DM_Uarg(uap,6), /* buflen */ + (void *) DM_Parg(uap,7), /* bufp */ + (size_t *) DM_Parg(uap,8));/* rlenp */ + + break; + case DM_GET_EVENTLIST: + error = dm_get_eventlist( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* nelem */ + (dm_eventset_t *) DM_Parg(uap,6),/* eventsetp */ + (u_int *) DM_Parg(uap,7));/* nelemp */ + break; + case DM_GET_EVENTS: + error = dm_get_events( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (u_int) DM_Uarg(uap,2), /* maxmsgs */ + (u_int) DM_Uarg(uap,3), /* flags */ + (size_t) DM_Uarg(uap,4), /* 
buflen */ + (void *) DM_Parg(uap,5), /* bufp */ + (size_t *) DM_Parg(uap,6));/* rlenp */ + break; + case DM_GET_FILEATTR: + error = dm_get_fileattr( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* mask */ + (dm_stat_t *) DM_Parg(uap,6));/* statp */ + break; + case DM_GET_MOUNTINFO: + error = dm_get_mountinfo( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (size_t) DM_Uarg(uap,5), /* buflen */ + (void *) DM_Parg(uap,6), /* bufp */ + (size_t *) DM_Parg(uap,7));/* rlenp */ + break; + case DM_GET_REGION: + error = dm_get_region( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* nelem */ + (dm_region_t *) DM_Parg(uap,6), /* regbufp */ + (u_int *) DM_Parg(uap,7));/* nelemp */ + break; + case DM_GETALL_DISP: + error = dm_getall_disp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (size_t) DM_Uarg(uap,2), /* buflen */ + (void *) DM_Parg(uap,3), /* bufp */ + (size_t *) DM_Parg(uap,4));/* rlenp */ + break; + case DM_GETALL_DMATTR: + error = dm_getall_dmattr( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (size_t) DM_Uarg(uap,5), /* buflen */ + (void *) DM_Parg(uap,6), /* bufp */ + (size_t *) DM_Parg(uap,7));/* rlenp */ + break; + case DM_GETALL_INHERIT: + error = dm_getall_inherit( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* nelem */ + (dm_inherit_t *)DM_Parg(uap,6), /* inheritbufp*/ + (u_int *) DM_Parg(uap,7));/* nelemp */ + 
break; + case DM_GETALL_SESSIONS: + error = dm_getall_sessions( + (u_int) DM_Uarg(uap,1), /* nelem */ + (dm_sessid_t *) DM_Parg(uap,2), /* sidbufp */ + (u_int *) DM_Parg(uap,3));/* nelemp */ + break; + case DM_GETALL_TOKENS: + error = dm_getall_tokens( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (u_int) DM_Uarg(uap,2), /* nelem */ + (dm_token_t *) DM_Parg(uap,3), /* tokenbufp */ + (u_int *) DM_Parg(uap,4));/* nelemp */ + break; + case DM_INIT_ATTRLOC: + error = dm_init_attrloc( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_attrloc_t *) DM_Parg(uap,5));/* locp */ + break; + case DM_MKDIR_BY_HANDLE: + error = dm_mkdir_by_handle( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* dirhanp */ + (size_t) DM_Uarg(uap,3), /* dirhlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (void *) DM_Parg(uap,5), /* hanp */ + (size_t) DM_Uarg(uap,6), /* hlen */ + (char *) DM_Parg(uap,7));/* cname */ + break; + case DM_MOVE_EVENT: + error = dm_move_event( + (dm_sessid_t) DM_Uarg(uap,1), /* srcsid */ + (dm_token_t) DM_Uarg(uap,2), /* token */ + (dm_sessid_t) DM_Uarg(uap,3), /* targetsid */ + (dm_token_t *) DM_Parg(uap,4));/* rtokenp */ + break; + case DM_OBJ_REF_HOLD: + error = dm_obj_ref_hold( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (dm_token_t) DM_Uarg(uap,2), /* token */ + (void *) DM_Parg(uap,3), /* hanp */ + (size_t) DM_Uarg(uap,4));/* hlen */ + break; + case DM_OBJ_REF_QUERY: + use_rvp = 1; + error = dm_obj_ref_query_rvp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (dm_token_t) DM_Uarg(uap,2), /* token */ + (void *) DM_Parg(uap,3), /* hanp */ + (size_t) DM_Uarg(uap,4), /* hlen */ + &rvp); + break; + case DM_OBJ_REF_RELE: + error = dm_obj_ref_rele( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (dm_token_t) DM_Uarg(uap,2), /* token */ + (void *) DM_Parg(uap,3), /* hanp */ + (size_t) DM_Uarg(uap,4));/* hlen */ + break; + case 
DM_PATH_TO_FSHANDLE: + error = dm_path_to_fshdl( + (char *) DM_Parg(uap,1), /* path */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t *) DM_Parg(uap,3));/* hlenp */ + break; + case DM_PATH_TO_HANDLE: + error = dm_path_to_hdl( + (char *) DM_Parg(uap,1), /* path */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t *) DM_Parg(uap,3));/* hlenp */ + break; + case DM_PENDING: + error = dm_pending( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (dm_token_t) DM_Uarg(uap,2), /* token */ + (dm_timestruct_t *) DM_Parg(uap,3));/* delay */ + break; + case DM_PROBE_HOLE: + error = dm_probe_hole( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_off_t) DM_Uarg(uap,5), /* off */ + (dm_size_t) DM_Uarg(uap,6), /* len */ + (dm_off_t *) DM_Parg(uap,7), /* roffp */ + (dm_size_t *) DM_Parg(uap,8));/* rlenp */ + break; + case DM_PUNCH_HOLE: + error = dm_punch_hole( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_off_t) DM_Uarg(uap,5), /* off */ + (dm_size_t) DM_Uarg(uap,6));/* len */ + break; + case DM_QUERY_RIGHT: + error = dm_query_right( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_right_t *) DM_Parg(uap,5));/* rightp */ + break; + case DM_QUERY_SESSION: + error = dm_query_session( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (size_t) DM_Uarg(uap,2), /* buflen */ + (void *) DM_Parg(uap,3), /* bufp */ + (size_t *) DM_Parg(uap,4));/* rlenp */ + break; + case DM_READ_INVIS: + use_rvp = 1; + error = dm_read_invis_rvp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_off_t) DM_Uarg(uap,5), /* off */ + (dm_size_t) 
DM_Uarg(uap,6), /* len */ + (void *) DM_Parg(uap,7), /* bufp */ + &rvp); + break; + case DM_RELEASE_RIGHT: + error = dm_release_right( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4));/* token */ + break; + case DM_REMOVE_DMATTR: + error = dm_remove_dmattr( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (int) DM_Uarg(uap,5), /* setdtime */ + (dm_attrname_t *) DM_Parg(uap,6));/* attrnamep */ + break; + case DM_REQUEST_RIGHT: + error = dm_request_right( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* flags */ + (dm_right_t) DM_Uarg(uap,6));/* right */ + break; + case DM_RESPOND_EVENT: + error = dm_respond_event( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (dm_token_t) DM_Uarg(uap,2), /* token */ + (dm_response_t) DM_Uarg(uap,3), /* response */ + (int) DM_Uarg(uap,4), /* reterror */ + (size_t) DM_Uarg(uap,5), /* buflen */ + (void *) DM_Parg(uap,6));/* respbufp */ + break; + case DM_SEND_MSG: + error = dm_send_msg( + (dm_sessid_t) DM_Uarg(uap,1), /* targetsid */ + (dm_msgtype_t) DM_Uarg(uap,2), /* msgtype */ + (size_t) DM_Uarg(uap,3), /* buflen */ + (void *) DM_Parg(uap,4));/* bufp */ + break; + case DM_SET_DISP: + error = dm_set_disp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_eventset_t *) DM_Parg(uap,5),/* eventsetp */ + (u_int) DM_Uarg(uap,6));/* maxevent */ + break; + case DM_SET_DMATTR: + error = dm_set_dmattr( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_attrname_t *) 
DM_Parg(uap,5),/* attrnamep */ + (int) DM_Uarg(uap,6), /* setdtime */ + (size_t) DM_Uarg(uap,7), /* buflen */ + (void *) DM_Parg(uap,8));/* bufp */ + break; + case DM_SET_EVENTLIST: + error = dm_set_eventlist( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_eventset_t *) DM_Parg(uap,5),/* eventsetp */ + (u_int) DM_Uarg(uap,6));/* maxevent */ + break; + case DM_SET_FILEATTR: + error = dm_set_fileattr( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* mask */ + (dm_fileattr_t *)DM_Parg(uap,6));/* attrp */ + break; + case DM_SET_INHERIT: + error = dm_set_inherit( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_attrname_t *)DM_Parg(uap,5),/* attrnamep */ + (mode_t) DM_Uarg(uap,6));/* mode */ + break; + case DM_SET_REGION: + error = dm_set_region( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (u_int) DM_Uarg(uap,5), /* nelem */ + (dm_region_t *) DM_Parg(uap,6), /* regbufp */ + (dm_boolean_t *) DM_Parg(uap,7));/* exactflagp */ + break; + case DM_SET_RETURN_ON_DESTROY: + error = dm_set_return_on_destroy( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (dm_attrname_t *) DM_Parg(uap,5),/* attrnamep */ + (dm_boolean_t) DM_Uarg(uap,6));/* enable */ + break; + case DM_SYMLINK_BY_HANDLE: + error = dm_symlink_by_handle( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* dirhanp */ + (size_t) DM_Uarg(uap,3), /* dirhlen */ + (dm_token_t) DM_Uarg(uap,4), /* token 
*/ + (void *) DM_Parg(uap,5), /* hanp */ + (size_t) DM_Uarg(uap,6), /* hlen */ + (char *) DM_Parg(uap,7), /* cname */ + (char *) DM_Parg(uap,8));/* path */ + break; + case DM_SYNC_BY_HANDLE: + error = dm_sync_by_handle( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4));/* token */ + break; + case DM_UPGRADE_RIGHT: + error = dm_upgrade_right( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4));/* token */ + break; + case DM_WRITE_INVIS: + use_rvp = 1; + error = dm_write_invis_rvp( + (dm_sessid_t) DM_Uarg(uap,1), /* sid */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (dm_token_t) DM_Uarg(uap,4), /* token */ + (int) DM_Uarg(uap,5), /* flags */ + (dm_off_t) DM_Uarg(uap,6), /* off */ + (dm_size_t) DM_Uarg(uap,7), /* len */ + (void *) DM_Parg(uap,8), /* bufp */ + &rvp); + break; + case DM_OPEN_BY_HANDLE: + use_rvp = 1; + error = dm_open_by_handle_rvp( + (unsigned int) DM_Uarg(uap,1), /* fd */ + (void *) DM_Parg(uap,2), /* hanp */ + (size_t) DM_Uarg(uap,3), /* hlen */ + (int) DM_Uarg(uap,4), /* flags */ + &rvp); + break; + default: + error = ENOSYS; + break; + } + /* If it was an *_rvp() function, then + if error==0, return |rvp| + */ + if( use_rvp && (error == 0) ) + return rvp; + else + return -error; +} + + + +static int +dmapi_open(struct inode *inode, struct file *file) +{ + return 0; +} + + +static int +dmapi_release(struct inode *inode, struct file *file) +{ + return 0; +} + + +/* say hello, and let me know the device is hooked up */ +static ssize_t +dmapi_dump(struct file *file, char *buf, size_t count, loff_t *ppos) +{ + char tmp[50]; + int len; + if( *ppos == 0 ){ + len = sprintf( tmp, "# " DM_VER_STR_CONTENTS "\n" ); + if( copy_to_user(buf, tmp, len) ) + return -EFAULT; + *ppos += 1; + return len; + } + return 0; +} + +static struct 
file_operations dmapi_fops = { + open: dmapi_open, + ioctl: dmapi_ioctl, + read: dmapi_dump, + release: dmapi_release +}; + +static struct miscdevice dmapi_dev = { + minor: DMAPI_MINOR, + name: "dmapi", + fops: &dmapi_fops +}; + + + +#ifdef CONFIG_PROC_FS +static int +dmapi_summary(char *buffer, char **start, off_t offset, + int count, int *eof, void *data) +{ + int len; + + extern u_int dm_sessions_active; + extern dm_sessid_t dm_next_sessid; + extern dm_token_t dm_next_token; + extern dm_sequence_t dm_next_sequence; + extern int dm_fsys_cnt; + +#define CHKFULL if(len >= count) break; +#define ADDBUF(a,b) len += sprintf(buffer + len, a, b); CHKFULL; + + len=0; + while(1){ + ADDBUF("dm_sessions_active=%u\n", dm_sessions_active); + ADDBUF("dm_next_sessid=%d\n", (int)dm_next_sessid); + ADDBUF("dm_next_token=%d\n", (int)dm_next_token); + ADDBUF("dm_next_sequence=%u\n", (u_int)dm_next_sequence); + ADDBUF("dm_fsys_cnt=%d\n", dm_fsys_cnt); + + break; + } + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} +#endif + + +void +dmapi_init_procfs(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *entry; + + if ((entry = proc_mkdir( DMAPI_DBG_PROCFS, 0)) == NULL ) + return; + entry->owner = THIS_MODULE; + entry->mode = S_IFDIR | S_IRUSR | S_IXUSR; + + if ((entry = proc_mkdir( DMAPI_DBG_PROCFS "/fsreg", 0)) == NULL ) + return; + entry->owner = THIS_MODULE; + + if ((entry = proc_mkdir( DMAPI_DBG_PROCFS "/sessions", 0)) == NULL ) + return; + entry->owner = THIS_MODULE; + + entry = create_proc_read_entry( DMAPI_DBG_PROCFS "/summary", 0, 0, dmapi_summary, NULL); + entry->owner = THIS_MODULE; + + entry = proc_mknod( DMAPI_PROCFS, S_IFCHR | S_IRUSR | S_IWUSR, + NULL, MKDEV(MISC_MAJOR,DMAPI_MINOR)); + if( entry == NULL ) + return; + entry->owner = THIS_MODULE; +#endif +} + +void +dmapi_cleanup_procfs(void) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry( 
DMAPI_PROCFS, NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS "/summary", NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS "/fsreg", NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS "/sessions", NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS, NULL);
+#endif
+}
+
+/*
+ * dmapi_init
+ *	Initialisation entry point. Creates the dm_tokdata, dm_fsreg and
+ *	dm_session slab caches; when one of them cannot be created the
+ *	caches made so far are destroyed and -ENOMEM is returned. The misc
+ *	device is then registered and the procfs entries are created.
+ *
+ *	NOTE(review): a misc_register() failure is only logged - 0 is still
+ *	returned and the procfs entries are still created. Confirm that
+ *	this best-effort behaviour is intended.
+ */
+int __init dmapi_init(void)
+{
+	int ret;
+
+	dm_tokdata_cachep = kmem_cache_create("dm_tokdata",
+				sizeof(struct dm_tokdata), 0, 0, NULL, NULL);
+	if (dm_tokdata_cachep == NULL)
+		return -ENOMEM;
+
+	dm_fsreg_cachep = kmem_cache_create("dm_fsreg",
+				sizeof(struct dm_fsreg), 0, 0, NULL, NULL);
+	if (dm_fsreg_cachep == NULL) {
+		kmem_cache_destroy(dm_tokdata_cachep);
+		return -ENOMEM;
+	}
+
+	dm_session_cachep = kmem_cache_create("dm_session",
+				sizeof(struct dm_session), 0, 0, NULL, NULL);
+	if (dm_session_cachep == NULL) {
+		kmem_cache_destroy(dm_tokdata_cachep);
+		kmem_cache_destroy(dm_fsreg_cachep);
+		return -ENOMEM;
+	}
+
+	ret = misc_register(&dmapi_dev);
+	if( ret != 0 )
+		printk(KERN_ERR "dmapi_init: misc_register returned %d\n", ret);
+	dmapi_init_procfs();
+	return(0);
+}
+/*
+ * dmapi_uninit
+ *	Undoes dmapi_init(): deregisters the misc device, removes the
+ *	procfs entries, destroys the three slab caches and finally calls
+ *	dm_fsys_vector_free().
+ */
+void __exit dmapi_uninit(void)
+{
+	misc_deregister(&dmapi_dev);
+	dmapi_cleanup_procfs();
+	kmem_cache_destroy(dm_tokdata_cachep);
+	kmem_cache_destroy(dm_fsreg_cachep);
+	kmem_cache_destroy(dm_session_cachep);
+	dm_fsys_vector_free();
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/err.log linux-2.4-xfs/fs/xfs/err.log
--- linux-2.4.19/fs/xfs/err.log Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/err.log Wed Sep 4 23:33:36 2002
@@ -0,0 +1,2 @@
+Makefile:150: /Rules.make: No such file or directory
+make: *** No rule to make target `/Rules.make'. Stop.
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/Makefile linux-2.4-xfs/fs/xfs/linux/Makefile
--- linux-2.4.19/fs/xfs/linux/Makefile Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/Makefile Thu Sep 5 15:35:08 2002
@@ -0,0 +1,65 @@
+#
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# +# Makefile for XFS on Linux. + +# This needs -I.. because everything does #include <xfs.h> instead of "xfs.h". +# The code is wrong, local files should be included using "xfs.h", not <xfs.h> +# but I am not going to change every file at the moment. +EXTRA_CFLAGS += -I.. 
-funsigned-char + +ifeq ($(CONFIG_XFS_DEBUG),y) + EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG +endif + +O_TARGET := linux_xfs.o +ifneq ($(MAKECMDGOALS),modules_install) + obj-m := $(O_TARGET) +endif + +export-objs := xfs_globals.o + +obj-$(CONFIG_PROC_FS) += xfs_stats.o +obj-$(CONFIG_SYSCTL) += xfs_sysctl.o + +obj-y += xfs_aops.o \ + xfs_behavior.o \ + xfs_file.o \ + xfs_fs_subr.o \ + xfs_globals.o \ + xfs_ioctl.o \ + xfs_iops.o \ + xfs_lrw.o \ + xfs_super.o \ + xfs_vnode.o + +include $(TOPDIR)/Rules.make diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/Makefile.in linux-2.4-xfs/fs/xfs/linux/Makefile.in --- linux-2.4.19/fs/xfs/linux/Makefile.in Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/Makefile.in Thu Sep 5 15:35:08 2002 @@ -0,0 +1,44 @@ +# +# +# Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# +# Makefile for XFS on Linux. + +expsyms(xfs_globals.o) + +objlink(linux_xfs.o xfs_behavior.o xfs_cred.o xfs_file.o xfs_fs_subr.o + xfs_globals.o xfs_ioctl.o xfs_iops.o xfs_lrw.o xfs_stats.o xfs_sysctl.o + xfs_super.o xfs_vfs.o xfs_vnode.o xfs_aops.o) + +# No select() for linux_xfs in this directory. It is a sub-component of XFS, +# see fs/xfs/Makefile.in for the objlink. + +extra_cflags_all($(XFS_EXTRA_CFLAGS)) diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_aops.c linux-2.4-xfs/fs/xfs/linux/xfs_aops.c --- linux-2.4.19/fs/xfs/linux/xfs_aops.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_aops.c Thu Sep 5 15:35:08 2002 @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,9)
+#define page_buffers(page)	((page)->buffers)
+#define page_has_buffers(page)	((page)->buffers)
+#endif
+
+STATIC int linvfs_pb_bmap(struct inode *, loff_t, ssize_t,
+				struct page_buf_bmap_s *, int);
+STATIC int delalloc_convert(struct inode *, struct page *, int, int);
+
+/*
+ * match_offset_to_mapping
+ *	Finds the corresponding mapping in block @map array of the
+ *	given @offset within a @page.
+ *
+ *	Returns @map when the file byte offset of (@page, @offset) lies in
+ *	[pbm_offset, pbm_offset + pbm_bsize), otherwise NULL. Only the
+ *	single entry that @map points to is tested.
+ */
+STATIC page_buf_bmap_t *
+match_offset_to_mapping(
+	struct page		*page,
+	page_buf_bmap_t		*map,
+	unsigned long		offset)
+{
+	loff_t			full_offset;	/* offset from start of file */
+
+	ASSERT(offset < PAGE_CACHE_SIZE);
+
+	full_offset = page->index;		/* NB: using 64bit number */
+	full_offset <<= PAGE_CACHE_SHIFT;	/* offset from file start */
+	full_offset += offset;			/* offset from page start */
+
+	if (full_offset < map->pbm_offset)
+		return NULL;
+	if (map->pbm_offset + map->pbm_bsize > full_offset)
+		return map;
+	return NULL;
+}
+/*
+ * map_buffer_at_offset
+ *	Point buffer head @bh, which covers byte @offset of @page, at its
+ *	disk block inside the extent described by @mp. The block number is
+ *	pbm_bn shifted down by (block_bits - 9) plus the number of blocks
+ *	of size (1 << block_bits) between the start of the extent and the
+ *	buffer.
+ *
+ *	The buffer is locked here and is still locked on return, with
+ *	BH_Mapped set and BH_Delay cleared. @mp must not describe a hole, a
+ *	delayed or an unwritten extent (asserted).
+ */
+STATIC void
+map_buffer_at_offset(
+	struct page		*page,
+	struct buffer_head	*bh,
+	unsigned long		offset,
+	int			block_bits,
+	page_buf_bmap_t		*mp)
+{
+	page_buf_daddr_t	bn;
+	loff_t			delta;
+	int			sector_shift;
+
+	ASSERT(!(mp->pbm_flags & PBMF_HOLE));
+	ASSERT(!(mp->pbm_flags & PBMF_DELAY));
+	ASSERT(!(mp->pbm_flags & PBMF_UNWRITTEN));
+	ASSERT(mp->pbm_bn != PAGE_BUF_DADDR_NULL);
+
+	delta = page->index;
+	delta <<= PAGE_CACHE_SHIFT;
+	delta += offset;
+	delta -= 
mp->pbm_offset;
+	delta >>= block_bits;		/* bytes into the extent -> blocks */
+
+	sector_shift = block_bits - 9;
+	bn = mp->pbm_bn >> sector_shift;
+	bn += delta;
+	ASSERT((bn << sector_shift) >= mp->pbm_bn);
+
+	lock_buffer(bh);
+	bh->b_blocknr = bn;
+	bh->b_dev = mp->pbm_target->pbr_kdev;
+	set_bit(BH_Mapped, &bh->b_state);
+	clear_bit(BH_Delay, &bh->b_state);
+}
+
+/*
+ * Convert delalloc space to real space, do not flush the
+ * data out to disk, that will be done by the caller.
+ *
+ * Returns 1 when delalloc_convert() succeeded. Returns 0 when it failed
+ * or when the page starts at or beyond i_size; in that case
+ * block_flushpage() has been called on the page and its uptodate flag
+ * has been cleared.
+ */
+STATIC int
+release_page(
+	struct page		*page)
+{
+	struct inode		*inode = (struct inode*)page->mapping->host;
+	unsigned long		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int			ret;
+
+	/* Are we off the end of the file ? */
+	if (page->index >= end_index) {
+		unsigned offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+		if ((page->index >= end_index+1) || !offset) {
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+	ret = delalloc_convert(inode, page, 0, 0);	/* no I/O, no allocation of unmapped space */
+
+out:
+	if (ret < 0) {
+		block_flushpage(page, 0);
+		ClearPageUptodate(page);
+
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Convert delalloc or unmapped space to real space and flush out
+ * to disk.
+ *
+ * @delalloc is the caller's count of delayed allocate buffers on the
+ * page; when it is zero delalloc_convert() is asked to allocate space for
+ * unmapped buffers instead. Empty buffers are attached to the page first
+ * if it has none.
+ *
+ * Returns the result of delalloc_convert(), or -EIO for a page that
+ * starts at or beyond i_size. On failure the page's uptodate flag is
+ * cleared and the page is unlocked here.
+ */
+STATIC int
+write_full_page(
+	struct page		*page,
+	int			delalloc)
+{
+	struct inode		*inode = (struct inode*)page->mapping->host;
+	unsigned long		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int			ret;
+
+	/* Are we off the end of the file ? */
+	if (page->index >= end_index) {
+		unsigned offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+		if ((page->index >= end_index+1) || !offset) {
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
+	}
+
+	ret = delalloc_convert(inode, page, 1, delalloc == 0);
+
+out:
+	if (ret < 0) {
+		/*
+		 * If it's delalloc and we have nowhere to put it,
+		 * throw it away.
+		 */
+		if (delalloc)
+			block_flushpage(page, 0);
+		ClearPageUptodate(page);
+		unlock_page(page);
+	}
+
+	return ret;
+}
+
+/*
+ * Look for a page at index which is unlocked and not mapped
+ * yet - clustering for mmap write case.
+ *
+ * Returns the number of bytes at the start of the page that qualify:
+ * PAGE_CACHE_SIZE for a dirty page without buffers, otherwise the summed
+ * size of the leading buffers that are uptodate and unmapped, the walk
+ * ending once @pg_offset bytes have been reached. Returns 0 when the page
+ * is not in the page cache, cannot be locked, or is not dirty. The page
+ * is unlocked and released again before returning.
+ */
+STATIC unsigned int
+probe_unmapped_page(
+	struct address_space	*mapping,
+	unsigned long		index,
+	unsigned int		pg_offset)
+{
+	struct page		*page;
+	int			ret = 0;
+
+	page = find_get_page(mapping, index);
+	if (!page)
+		return 0;
+	if (TryLockPage(page)) {
+		page_cache_release(page);
+		return 0;
+	}
+	if (page->mapping && PageDirty(page)) {
+		if (!page_has_buffers(page)) {
+			ret = PAGE_CACHE_SIZE;
+		} else {
+			struct buffer_head	*bh, *head;
+			bh = head = page_buffers(page);
+			do {
+				if (buffer_mapped(bh) || !buffer_uptodate(bh)) {
+					break;
+				}
+				ret += bh->b_size;
+				if (ret >= pg_offset)
+					break;
+			} while ((bh = bh->b_this_page) != head);
+		}
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+/*
+ * probe_unmapped_cluster
+ *	Returns the number of bytes covered by unmapped buffers starting at
+ *	@bh in @startpage. When the walk gets round to @head again, i.e.
+ *	every buffer up to the end of the start page was unmapped, the
+ *	following pages of the file are probed with probe_unmapped_page()
+ *	and their results added, up to and including the partial page that
+ *	holds i_size.
+ */
+STATIC unsigned int
+probe_unmapped_cluster(
+	struct inode		*inode,
+	struct page		*startpage,
+	struct buffer_head	*bh,
+	struct buffer_head	*head)
+{
+	unsigned long		tindex, tlast;
+	unsigned int		len, total = 0;
+	struct address_space	*mapping = inode->i_mapping;
+
+	/* First sum forwards in this page */
+	do {
+		if (buffer_mapped(bh))
+			break;
+		total += bh->b_size;
+	} while ((bh = bh->b_this_page) != head);
+
+	/* if we reached the end of the page, sum forwards in
+	 * following pages.
+	 */
+	if (bh == head) {
+		tlast = inode->i_size >> PAGE_CACHE_SHIFT;
+		for (tindex = startpage->index + 1; tindex < tlast; tindex++) {
+			len = probe_unmapped_page(mapping, tindex,
+							PAGE_CACHE_SIZE);
+			if (!len)
+				break;
+			total += len;
+		}
+		/* only the bytes below i_size count in the last, partial page */
+		if ((tindex == tlast) && (inode->i_size & ~PAGE_CACHE_MASK)) {
+			len = probe_unmapped_page(mapping, tindex,
+					inode->i_size & ~PAGE_CACHE_MASK);
+			total += len;
+		}
+	}
+	return total;
+}
+
+/*
+ * Probe for a given page (index) in the inode & test if it is delayed.
+ * Returns page locked and with an extra reference count.
+ */
+STATIC struct page *
+probe_page(
+	struct inode		*inode,
+	unsigned long		index)
+{
+	struct page		*page;
+
+	page = find_get_page(inode->i_mapping, index);
+	if (!page)
+		return NULL;			/* nothing found at index */
+	if (TryLockPage(page)) {		/* page lock not obtained */
+		page_cache_release(page);
+		return NULL;
+	}
+	if (page->mapping && page_has_buffers(page)) {
+		struct buffer_head	*bh, *head;
+
+		bh = head = page_buffers(page);
+		do {
+			if (buffer_delay(bh))
+				return page;	/* still locked, reference kept */
+		} while ((bh = bh->b_this_page) != head);
+	}
+	unlock_page(page);			/* no delayed buffer: drop lock and reference */
+	page_cache_release(page);
+	return NULL;
+}
+/*
+ * submit_page
+ *	Start write I/O on the @cnt buffers collected in @bh_arr. Each
+ *	buffer is passed to set_buffer_async_io(), gets BH_Uptodate set and
+ *	BH_Dirty cleared, and is then handed to submit_bh(WRITE). When
+ *	@cnt is zero no I/O is started and @page is unlocked here instead.
+ *
+ *	NOTE(review): with a non-zero @cnt the page is not unlocked in this
+ *	function; presumably I/O completion does that - confirm.
+ */
+STATIC void
+submit_page(
+	struct page		*page,
+	struct buffer_head	*bh_arr[],
+	int			cnt)
+{
+	struct buffer_head	*bh;
+	int			i;
+
+	if (cnt) {
+		for (i = 0; i < cnt; i++) {
+			bh = bh_arr[i];
+			set_buffer_async_io(bh);
+			set_bit(BH_Uptodate, &bh->b_state);
+			clear_bit(BH_Dirty, &bh->b_state);
+		}
+
+		for (i = 0; i < cnt; i++)
+			submit_bh(WRITE, bh_arr[i]);
+	} else
+		unlock_page(page);
+}
+/*
+ * map_page
+ *	Walk the buffers of @page and prepare them for writing against the
+ *	mapping @maps. Buffers that are not uptodate are skipped. With
+ *	@all_bh set, buffers that are already mapped and not delayed are
+ *	locked and collected without being remapped. Every other buffer
+ *	whose offset falls inside @maps is mapped (and thereby locked) with
+ *	map_buffer_at_offset().
+ *
+ *	A buffer is only stored in @bh_arr when @startio is set and it
+ *	starts below the end of the file within this page; a freshly mapped
+ *	buffer that is not stored is unlocked again. Returns the number of
+ *	buffers stored in @bh_arr.
+ */
+STATIC int
+map_page(
+	struct inode		*inode,
+	struct page		*page,
+	page_buf_bmap_t		*maps,
+	struct buffer_head	*bh_arr[],
+	int			startio,
+	int			all_bh)
+{
+	struct buffer_head	*bh, *head;
+	page_buf_bmap_t		*mp = maps, *tmp;
+	unsigned long		end, offset, end_index;
+	int			i = 0, index = 0;
+	int			bbits = inode->i_blkbits;
+
+	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	if (page->index < end_index) {
+		end = PAGE_CACHE_SIZE;
+	} else {
+		end = inode->i_size & (PAGE_CACHE_SIZE-1);	/* bytes of this page below i_size */
+	}
+	bh = head = page_buffers(page);
+	do {
+		offset = i << bbits;		/* byte offset of bh within the page */
+		if (!buffer_uptodate(bh))
+			continue;
+		if (buffer_mapped(bh) && !buffer_delay(bh) && all_bh) {
+			if (startio && (offset < end)) {
+				lock_buffer(bh);
+				bh_arr[index++] = bh;
+			}
+			continue;
+		}
+		tmp = match_offset_to_mapping(page, mp, offset);
+		if (!tmp)
+			continue;
+		ASSERT(!(tmp->pbm_flags & PBMF_HOLE));
+		ASSERT(!(tmp->pbm_flags & PBMF_DELAY));
+		map_buffer_at_offset(page, bh, offset, bbits, tmp);
+		if (startio && (offset < end)) {
+			
bh_arr[index++] = bh;
+		} else {
+			unlock_buffer(bh);
+		}
+	} while (i++, (bh = bh->b_this_page) != head);	/* i++ also runs after a continue */
+
+	return index;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc pages only, for the original page it is possible that
+ * the page has no mapping at all.
+ *
+ * The buffers are prepared with map_page() and handed to submit_page(),
+ * after which one reference on the page is dropped; in cluster_write()
+ * that is the reference obtained through probe_page().
+ */
+STATIC void
+convert_page(
+	struct inode		*inode,
+	struct page		*page,
+	page_buf_bmap_t		*maps,
+	int			startio,
+	int			all_bh)
+{
+	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE];
+	int			cnt;
+
+	cnt = map_page(inode, page, maps, bh_arr, startio, all_bh);
+	submit_page(page, bh_arr, cnt);
+	page_cache_release(page);
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ *
+ * Pages are taken in index order from @tindex up to, but not including,
+ * the page index of the first byte after the extent. The walk ends at the
+ * first page for which probe_page() returns NULL.
+ */
+STATIC void
+cluster_write(
+	struct inode		*inode,
+	unsigned long		tindex,
+	page_buf_bmap_t		*mp,
+	int			startio,
+	int			all_bh)
+{
+	unsigned long		tlast;
+	struct page		*page;
+
+	tlast = (mp->pbm_offset + mp->pbm_bsize) >> PAGE_CACHE_SHIFT;
+	for (; tindex < tlast; tindex++) {
+		if (!(page = probe_page(inode, tindex)))
+			break;
+		convert_page(inode, page, mp, startio, all_bh);
+	}
+}
+
+/*
+ * Calling this without allocate_space set means we are being asked to
+ * flush a dirty buffer head. When called with async_write set then we
+ * are coming from writepage. A writepage call with allocate_space set
+ * means we are being asked to write out all of the page which is before
+ * EOF and therefore need to allocate space for unmapped portions of the
+ * page.
+ *
+ * NOTE(review): the function takes no parameter called async_write; the
+ * sentence above presumably refers to startio, which write_full_page()
+ * passes as 1 and release_page() as 0 - confirm and reword.
+ *
+ * Returns 0 on success. When linvfs_pb_bmap() fails, its error value is
+ * returned after the buffers collected so far have been unlocked.
+ */
+STATIC int
+delalloc_convert(
+	struct inode		*inode,	/* inode containing page */
+	struct page		*page,	/* page to convert - locked */
+	int			startio, /* start io on the page */
+	int			allocate_space)	/* also map unmapped buffers */
+{
+	struct buffer_head	*bh, *head;
+	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE];
+	page_buf_bmap_t		*mp, map;
+	int			i, cnt = 0;
+	int			len, err;
+	unsigned long		p_offset = 0;	/* offset of bh within the page */
+	loff_t			offset;		/* offset of bh within the file */
+	loff_t			end_offset;	/* end of page, clamped to i_size */
+
+	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	end_offset = offset + PAGE_CACHE_SIZE;
+	if (end_offset > inode->i_size)
+		end_offset = inode->i_size;
+
+	bh = head = page_buffers(page);
+	mp = NULL;
+
+	len = bh->b_size;
+	do {
+		if (!buffer_uptodate(bh) && !startio) {
+			goto next_bh;
+		}
+
+		if (mp) {	/* does the mapping found earlier still cover this buffer? */
+			mp = match_offset_to_mapping(page, &map, p_offset);
+		}
+
+		if (buffer_delay(bh)) {
+			if (!mp) {
+				err = linvfs_pb_bmap(inode, offset, len, &map,
+						PBF_WRITE|PBF_FILE_ALLOCATE);
+				if (err)
+					goto error;
+				mp = match_offset_to_mapping(page, &map,
+								p_offset);
+			}
+			if (mp) {
+				map_buffer_at_offset(page, bh, p_offset,
+					inode->i_blkbits, mp);	/* leaves bh locked */
+				if (startio) {
+					bh_arr[cnt++] = bh;
+				} else {
+					unlock_buffer(bh);
+				}
+			}
+		} else if (!buffer_mapped(bh) && allocate_space) {
+			int	size;
+
+			/* Getting here implies an unmapped buffer was found,
+			 * and we are in a path where we need to write the
+			 * whole page out.
+			 */
+			if (!mp) {
+				size = probe_unmapped_cluster(inode, page,
+								bh, head);
+				err = linvfs_pb_bmap(inode, offset, size, &map,
+						PBF_WRITE|PBF_DIRECT);
+				if (err)
+					goto error;
+				mp = match_offset_to_mapping(page, &map,
+								p_offset);
+			}
+			if (mp) {
+				map_buffer_at_offset(page, bh, p_offset,
+					inode->i_blkbits, mp);	/* leaves bh locked */
+				if (startio) {
+					bh_arr[cnt++] = bh;
+				} else {
+					unlock_buffer(bh);
+				}
+			}
+		} else if (startio && buffer_mapped(bh)) {
+			if (buffer_dirty(bh) || allocate_space) {
+				lock_buffer(bh);
+				bh_arr[cnt++] = bh;
+			}
+		}
+
+next_bh:
+		offset += len;
+		p_offset += len;
+		bh = bh->b_this_page;
+	} while (offset < end_offset);
+
+	if (startio)
+		submit_page(page, bh_arr, cnt);
+
+	if (mp)		/* carry on with the pages that follow in the same extent */
+		cluster_write(inode, page->index + 1, mp,
+				startio, allocate_space);
+
+	return 0;
+
+error:
+	for (i = 0; i < cnt; i++) {	/* buffers queued for I/O are still locked */
+		unlock_buffer(bh_arr[i]);
+	}
+
+	return err;
+}
+/*
+ * linvfs_get_block_core
+ *	Common body of the get_block callbacks below. Asks VOP_BMAP for the
+ *	mapping of file block @iblock, passing @flags when @create is set
+ *	and PBF_READ otherwise, and records the result in @bh_result.
+ *
+ *	When creating at or beyond i_size without PBF_SYNC the request
+ *	covers (1 << XFS_WRITE_IO_LOG) bytes instead of a single block.
+ *	@direct marks callers for which a delayed allocate mapping is a
+ *	BUG().
+ *
+ *	Returns 0, or the negated error reported by VOP_BMAP.
+ */
+STATIC int
+linvfs_get_block_core(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create,
+	int			direct,
+	page_buf_flags_t	flags)
+{
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	page_buf_bmap_t		pbmap;
+	int			retpbbm = 1;
+	int			error;
+	ssize_t			size;
+	loff_t			offset = (loff_t)iblock << inode->i_blkbits;
+
+	/* If we are doing writes at the end of the file,
+	 * allocate in chunks
+	 */
+	if (create && (offset >= inode->i_size) && !(flags & PBF_SYNC))
+		size = 1 << XFS_WRITE_IO_LOG;
+	else
+		size = 1 << inode->i_blkbits;
+
+	VOP_BMAP(vp, offset, size,
+		create ? 
flags : PBF_READ, NULL,
+		(struct page_buf_bmap_s *)&pbmap, &retpbbm, error);
+	if (error)
+		return -error;
+
+	if (retpbbm == 0)	/* no mapping was returned */
+		return 0;
+
+	if (pbmap.pbm_bn != PAGE_BUF_DADDR_NULL) {
+		page_buf_daddr_t	bn;
+		loff_t			delta;
+
+		delta = offset - pbmap.pbm_offset;
+		delta >>= inode->i_blkbits;	/* blocks into the extent */
+
+		bn = pbmap.pbm_bn >> (inode->i_blkbits - 9);
+		bn += delta;
+
+		bh_result->b_blocknr = bn;
+		set_bit(BH_Mapped, &bh_result->b_state);
+	}
+
+	/* If we previously allocated a block out beyond eof and
+	 * we are now coming back to use it then we will need to
+	 * flag it as new even if it has a disk address.
+	 */
+	if (create &&
+	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+	     (offset >= inode->i_size))) {
+		set_bit(BH_New, &bh_result->b_state);
+	}
+
+	if (pbmap.pbm_flags & PBMF_DELAY) {
+		if (unlikely(direct))
+			BUG();
+
+		if (create) {
+			set_bit(BH_Mapped, &bh_result->b_state);
+			set_bit(BH_Uptodate, &bh_result->b_state);
+		}
+		set_bit(BH_Delay, &bh_result->b_state);
+	}
+
+	return 0;
+}
+/*
+ * linvfs_get_block
+ *	get_block callback that requests mappings with PBF_WRITE when
+ *	@create is set.
+ */
+int
+linvfs_get_block(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, bh_result,
+					create, 0, PBF_WRITE);
+}
+/*
+ * linvfs_get_block_sync
+ *	As linvfs_get_block(), but with PBF_SYNC added to the flags, which
+ *	makes linvfs_get_block_core() request a single block even at EOF.
+ */
+STATIC int
+linvfs_get_block_sync(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, bh_result,
+					create, 0, PBF_SYNC|PBF_WRITE);
+}
+
+/*
+ * linvfs_get_block_direct
+ *	get_block callback that passes PBF_WRITE|PBF_DIRECT and sets the
+ *	direct argument, so a delayed allocate mapping triggers BUG().
+ */
+STATIC int
+linvfs_get_block_direct(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, bh_result,
+					create, 1, PBF_WRITE|PBF_DIRECT);
+}
+/*
+ * linvfs_pb_bmap
+ *	Obtain a single mapping for @count bytes at file offset @offset
+ *	into @pbmapp, through VOP_STRATEGY when PBF_FILE_ALLOCATE is set in
+ *	@flags and through VOP_BMAP otherwise.
+ *
+ *	A PBF_WRITE|PBF_DIRECT request that comes back as a delayed
+ *	allocate mapping is retried with PBF_WRITE|PBF_FILE_ALLOCATE.
+ *	VMODIFY() is applied to the vnode for every PBF_WRITE request.
+ *
+ *	Returns the negated error value set by the VOP call.
+ */
+STATIC int
+linvfs_pb_bmap(
+	struct inode		*inode,
+	loff_t			offset,
+	ssize_t			count,
+	page_buf_bmap_t		*pbmapp,
+	int			flags)
+{
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	int			error, nmaps = 1;
+
+retry:
+	if (flags & PBF_FILE_ALLOCATE) {
+		VOP_STRATEGY(vp, offset, count, flags, NULL,
+				pbmapp, &nmaps, error);
+	} else {
+		VOP_BMAP(vp, 
offset, count, flags, NULL, + pbmapp, &nmaps, error); + } + if (flags & PBF_WRITE) { + if (unlikely((flags & PBF_DIRECT) && nmaps && + (pbmapp->pbm_flags & PBMF_DELAY))) { + flags = PBF_WRITE | PBF_FILE_ALLOCATE; + goto retry; + } + VMODIFY(vp); + } + return -error; +} + +STATIC int +linvfs_bmap( + struct address_space *mapping, + long block) +{ + struct inode *inode = (struct inode *)mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + /* block - Linux disk blocks 512b */ + /* bmap input offset - bytes 1b */ + /* bmap output bn - XFS BBs 512b */ + /* bmap output delta - bytes 1b */ + + vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address); + + VOP_RWLOCK(vp, VRWLOCK_READ); + VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error); + VOP_RWUNLOCK(vp, VRWLOCK_READ); + return generic_block_bmap(mapping, block, linvfs_get_block_direct); +} + +STATIC int +linvfs_read_full_page( + struct file *unused, + struct page *page) +{ + return block_read_full_page(page, linvfs_get_block); +} + +STATIC int +count_page_state( + struct page *page, + int *nr_delalloc, + int *nr_unmapped) +{ + *nr_delalloc = *nr_unmapped = 0; + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) && !buffer_mapped(bh)) { + (*nr_unmapped)++; + continue; + } + if (!buffer_delay(bh)) + continue; + (*nr_delalloc)++; + } while ((bh = bh->b_this_page) != head); + return 1; + } + + return 0; +} + +STATIC int +linvfs_write_full_page( + struct page *page) +{ + int flagset = 0; + int error; + int need_trans; + int nr_delalloc, nr_unmapped; + + if (count_page_state(page, &nr_delalloc, &nr_unmapped)) { + need_trans = nr_delalloc + nr_unmapped; + } else { + need_trans = 1; + } + + if ((current->flags & (PF_FSTRANS|PF_NOIO)) && need_trans) + goto out_fail; + + if (need_trans) { + current->flags |= PF_NOIO; + flagset = 1; + } + + error = write_full_page(page, nr_delalloc); + + if (flagset) + current->flags &= 
~PF_NOIO; + return error; + +out_fail: + SetPageDirty(page); + unlock_page(page); + return 0; +} + +STATIC int +linvfs_prepare_write( + struct file *file, + struct page *page, + unsigned int from, + unsigned int to) +{ + if (file && (file->f_flags & O_SYNC)) { + return block_prepare_write(page, from, to, + linvfs_get_block_sync); + } else { + return block_prepare_write(page, from, to, + linvfs_get_block); + } +} + +/* + * Initiate I/O on a kiobuf of user memory + */ +STATIC int +linvfs_direct_IO( + int rw, + struct inode *inode, + struct kiobuf *iobuf, + unsigned long blocknr, + int blocksize) +{ + struct page **maplist; + size_t page_offset; + page_buf_t *pb; + page_buf_bmap_t map; + int error = 0; + int pb_flags, map_flags, pg_index = 0; + size_t length, total; + loff_t offset; + size_t map_size, size; + + total = length = iobuf->length; + offset = blocknr; + offset <<= inode->i_blkbits; + + maplist = iobuf->maplist; + page_offset = iobuf->offset; + + map_flags = (rw ? PBF_WRITE : PBF_READ) | PBF_DIRECT; + pb_flags = (rw ? 
PBF_WRITE : PBF_READ) | PBF_FORCEIO | _PBF_LOCKABLE; + while (length) { + error = linvfs_pb_bmap(inode, offset, length, &map, map_flags); + if (error) + break; + + map_size = map.pbm_bsize - map.pbm_delta; + size = min(map_size, length); + if (map.pbm_flags & PBMF_HOLE) { + size_t zero_len = size; + + if (rw == WRITE) + break; + + /* Need to zero it all */ + while (zero_len) { + struct page *page; + size_t pg_len; + + pg_len = min((size_t) + (PAGE_CACHE_SIZE - page_offset), + zero_len); + + page = maplist[pg_index]; + + memset(kmap(page) + page_offset, 0, pg_len); + flush_dcache_page(page); + kunmap(page); + + zero_len -= pg_len; + if ((pg_len + page_offset) == PAGE_CACHE_SIZE) { + pg_index++; + page_offset = 0; + } else { + page_offset = (page_offset + pg_len) & + ~PAGE_CACHE_MASK; + } + } + } else { + int pg_count; + + pg_count = (size + page_offset + PAGE_CACHE_SIZE - 1) + >> PAGE_CACHE_SHIFT; + pb = pagebuf_lookup(map.pbm_target, inode, offset, + size, pb_flags); + /* Need to hook up pagebuf to kiobuf pages */ + pb->pb_pages = &maplist[pg_index]; + pb->pb_offset = page_offset; + pb->pb_page_count = pg_count; + + pb->pb_bn = map.pbm_bn + (map.pbm_delta >> 9); + pagebuf_iostart(pb, pb_flags); + pb->pb_flags &= ~_PBF_LOCKABLE; + pagebuf_rele(pb); + + page_offset = (page_offset + size) & ~PAGE_CACHE_MASK; + if (page_offset) + pg_count--; + pg_index += pg_count; + } + + offset += size; + length -= size; + } + + return error ? error : total - length; +} + +/* + * This gets a page into cleanable state - page locked on entry + * kept locked on exit. If the page is marked dirty we should + * not come this way. 
+ */ +STATIC int +linvfs_release_page( + struct page *page, + int gfp_mask) +{ + int need_trans; + int nr_delalloc, nr_unmapped; + + if (count_page_state(page, &nr_delalloc, &nr_unmapped)) { + need_trans = nr_delalloc; + } else { + need_trans = 0; + } + + if (need_trans == 0) { + return 1; + } + + if (gfp_mask & __GFP_FS) { + return release_page(page); + } + return 0; +} + + +struct address_space_operations linvfs_aops = { + .readpage = linvfs_read_full_page, + .writepage = linvfs_write_full_page, + .sync_page = block_sync_page, + .releasepage = linvfs_release_page, + .prepare_write = linvfs_prepare_write, + .commit_write = generic_commit_write, + .bmap = linvfs_bmap, + .direct_IO = linvfs_direct_IO, +}; diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_behavior.c linux-2.4-xfs/fs/xfs/linux/xfs_behavior.c --- linux-2.4.19/fs/xfs/linux/xfs_behavior.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_behavior.c Wed Aug 21 01:28:40 2002 @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + * + */ + +/* + * Source file used to associate/disassociate behaviors with virtualized + * objects. See behavior.h for more information about behaviors, etc. + * + * The implementation is split between functions in this file and macros + * in behavior.h. + */ +#include <xfs.h> + +kmem_zone_t *bhv_global_zone; + +/* + * Global initialization function called out of main. + */ +void +bhv_global_init(void) +{ + /* + * Initialize a behavior zone used by subsystems using behaviors + * but without any private data. In the UNIKERNEL case, this zone + * is used only for behaviors that are not yet isolated to a single + * cell. The only such user is in pshm.c in which a dummy vnode is + * obtained in support of vce avoidance logic. + */ + bhv_global_zone = kmem_zone_init(sizeof(bhv_desc_t), "bhv_global_zone"); +} + +/* + * Remove a behavior descriptor from a position in a behavior chain; + * the position is guaranteed not to be the first position. + * Should only be called by the bhv_remove() macro. + * + * The act of modifying the chain is done atomically w.r.t. ops-in-progress + * (see comment at top of behavior.h for more info on synchronization). 
+ */ +void +bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp) +{ + bhv_desc_t *curdesc, *prev; + + ASSERT(bhp->bh_first != NULL); + ASSERT(bhp->bh_first->bd_next != NULL); + + prev = bhp->bh_first; + for (curdesc = bhp->bh_first->bd_next; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc == bdp) + break; /* found it */ + prev = curdesc; + } + + ASSERT(curdesc == bdp); + prev->bd_next = bdp->bd_next; /* remove from after prev */ + /* atomic wrt oip's */ +} + +/* + * Look for a specific ops vector on the specified behavior chain. + * Return the associated behavior descriptor. Or NULL, if not found. + */ +bhv_desc_t * +bhv_lookup(bhv_head_t *bhp, void *ops) +{ + bhv_desc_t *curdesc; + + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc->bd_ops == ops) + return curdesc; + } + + return NULL; +} + +/* + * Look for a specific ops vector on the specified behavior chain. + * Return the associated behavior descriptor. Or NULL, if not found. + * + * The caller has not read locked the behavior chain, so acquire the + * lock before traversing the chain. + */ +bhv_desc_t * +bhv_lookup_unlocked(bhv_head_t *bhp, void *ops) +{ + bhv_desc_t *bdp; + + BHV_READ_LOCK(bhp); + bdp = bhv_lookup(bhp, ops); + BHV_READ_UNLOCK(bhp); + + return bdp; +} + +/* + * Return the base behavior in the chain, or NULL if the chain + * is empty. + * + * The caller has not read locked the behavior chain, so acquire the + * lock before traversing the chain. 
+ */ +bhv_desc_t * +bhv_base_unlocked(bhv_head_t *bhp) +{ + bhv_desc_t *curdesc; + + BHV_READ_LOCK(bhp); + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc->bd_next == NULL) { + BHV_READ_UNLOCK(bhp); + return curdesc; + } + } + + BHV_READ_UNLOCK(bhp); + return NULL; +} + +#define BHVMAGIC (void *)0xf00d + +/* ARGSUSED */ +void +bhv_head_init( + bhv_head_t *bhp, + char *name) +{ + bhp->bh_first = NULL; + bhp->bh_lockp = BHVMAGIC; +} + + +/* ARGSUSED */ +void +bhv_head_reinit( + bhv_head_t *bhp) +{ + ASSERT(bhp->bh_first == NULL); + ASSERT(bhp->bh_lockp == BHVMAGIC); +} + + +void +bhv_insert_initial( + bhv_head_t *bhp, + bhv_desc_t *bdp) +{ + ASSERT(bhp->bh_first == NULL); + ASSERT(bhp->bh_lockp == BHVMAGIC); + (bhp)->bh_first = bdp; +} + +void +bhv_head_destroy( + bhv_head_t *bhp) +{ + ASSERT(bhp->bh_first == NULL); + ASSERT(bhp->bh_lockp == BHVMAGIC); + bhp->bh_lockp = NULL; +} + diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_behavior.h linux-2.4-xfs/fs/xfs/linux/xfs_behavior.h --- linux-2.4.19/fs/xfs/linux/xfs_behavior.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_behavior.h Fri Aug 23 14:22:46 2002 @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. 
Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BEHAVIOR_H__ +#define __XFS_BEHAVIOR_H__ + +/* + * Header file used to associate behaviors with virtualized objects. + * + * A virtualized object is an internal, virtualized representation of + * OS entities such as persistent files, processes, or sockets. Examples + * of virtualized objects include vnodes, vprocs, and vsockets. Often + * a virtualized object is referred to simply as an "object." + * + * A behavior is essentially an implementation layer associated with + * an object. Multiple behaviors for an object are chained together, + * the order of chaining determining the order of invocation. Each + * behavior of a given object implements the same set of interfaces + * (e.g., the VOP interfaces). + * + * Behaviors may be dynamically inserted into an object's behavior chain, + * such that the addition is transparent to consumers that already have + * references to the object. Typically, a given behavior will be inserted + * at a particular location in the behavior chain. Insertion of new + * behaviors is synchronized with operations-in-progress (oip's) so that + * the oip's always see a consistent view of the chain. + * + * The term "interposition" is used to refer to the act of inserting + * a behavior such that it interposes on (i.e., is inserted in front + * of) a particular other behavior. 
A key example of this is when a + * system implementing distributed single system image wishes to + * interpose a distribution layer (providing distributed coherency) + * in front of an object that is otherwise only accessed locally. + * + * Note that the traditional vnode/inode combination is simply a virtualized + * object that has exactly one associated behavior. + * + * Behavior synchronization is logic which is necessary under certain + * circumstances to ensure that there is no conflict between ongoing operations + * traversing the behavior chain and those dynamically modifying the + * behavior chain. Because behavior synchronization adds extra overhead + * to virtual operation invocation, we want to restrict, as much as + * we can, the requirement for this extra code, to those situations + * in which it is truly necessary. + * + * Behavior synchronization is needed whenever there's at least one class + * of object in the system for which: + * 1) multiple behaviors for a given object are supported, + * -- AND -- + * 2a) insertion of a new behavior can happen dynamically at any time during + * the life of an active object, + * -- AND -- + * 3a) insertion of a new behavior needs to synchronize with existing + * ops-in-progress. + * -- OR -- + * 3b) multiple different behaviors can be dynamically inserted at + * any time during the life of an active object + * -- OR -- + * 3c) removal of a behavior can occur at any time during the life of + * an active object. + * -- OR -- + * 2b) removal of a behavior can occur at any time during the life of an + * active object + * + */ + +typedef void bhv_head_lock_t; + +/* + * Behavior head. Head of the chain of behaviors. + * Contained within each virtualized object data structure. + */ +typedef struct bhv_head { + struct bhv_desc *bh_first; /* first behavior in chain */ + bhv_head_lock_t *bh_lockp; /* pointer to lock info struct */ +} bhv_head_t; + +/* + * Behavior descriptor. Descriptor associated with each behavior. 
+ * Contained within the behavior's private data structure. + */ +typedef struct bhv_desc { + void *bd_pdata; /* private data for this behavior */ + void *bd_vobj; /* virtual object associated with */ + void *bd_ops; /* ops for this behavior */ + struct bhv_desc *bd_next; /* next behavior in chain */ +} bhv_desc_t; + +/* + * Behavior identity field. A behavior's identity determines the position + * where it lives within a behavior chain, and it's always the first field + * of the behavior's ops vector. The optional id field further identifies the + * subsystem responsible for the behavior. + */ +typedef struct bhv_identity { + __u16 bi_id; /* owning subsystem id */ + __u16 bi_position; /* position in chain */ +} bhv_identity_t; + +typedef bhv_identity_t bhv_position_t; + +#define BHV_IDENTITY_INIT(id,pos) {id, pos} + +#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos) + + +/* + * Define boundaries of position values. + */ +#define BHV_POSITION_INVALID 0 /* invalid position number */ +#define BHV_POSITION_BASE 1 /* base (last) implementation layer */ +#define BHV_POSITION_TOP 63 /* top (first) implementation layer */ + +/* + * Plumbing macros. 
+ */ +#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first) +#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next) +#define BHV_NEXTNULL(bdp) ((bdp)->bd_next) +#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj) +#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj) +#define BHV_PDATA(bdp) (bdp)->bd_pdata +#define BHV_OPS(bdp) (bdp)->bd_ops +#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops) +#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position) + + +#define BHV_READ_LOCK(bhp) +#define BHV_READ_UNLOCK(bhp) +#define BHV_NOT_READ_LOCKED(bhp) 1 +#define BHV_IS_WRITE_LOCKED(bhp) 1 +#define BHV_NOT_WRITE_LOCKED(bhp) 1 + +extern void bhv_head_init(bhv_head_t *, char *); +extern void bhv_head_destroy(bhv_head_t *); +extern void bhv_head_reinit(bhv_head_t *); +extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *); + +/* + * Initialize a new behavior descriptor. + * Arguments: + * bdp - pointer to behavior descriptor + * pdata - pointer to behavior's private data + * vobj - pointer to associated virtual object + * ops - pointer to ops for this behavior + */ +#define bhv_desc_init(bdp, pdata, vobj, ops) \ + { \ + (bdp)->bd_pdata = pdata; \ + (bdp)->bd_vobj = vobj; \ + (bdp)->bd_ops = ops; \ + (bdp)->bd_next = NULL; \ + } + +#define BHV_DESC_INIT(so,A,B) bhv_desc_init(&(so->so_bhv),so,A,B) + +/* + * Remove a behavior descriptor from a behavior chain. + */ +#define bhv_remove(bhp, bdp) \ + { \ + if ((bhp)->bh_first == (bdp)) { \ + /* \ + * Remove from front of chain. \ + * Atomic wrt oip's. \ + */ \ + (bhp)->bh_first = (bdp)->bd_next; \ + } else { \ + /* remove from non-front of chain */ \ + bhv_remove_not_first(bhp, bdp); \ + } \ + (bdp)->bd_vobj = NULL; \ + } + +/* + * Behavior module prototypes. 
+ */ +extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); +extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops); +extern bhv_desc_t * bhv_lookup_unlocked(bhv_head_t *bhp, void *ops); +extern bhv_desc_t * bhv_base_unlocked(bhv_head_t *bhp); + +#endif /* __XFS_BEHAVIOR_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_cred.h linux-2.4-xfs/fs/xfs/linux/xfs_cred.h --- linux-2.4.19/fs/xfs/linux/xfs_cred.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_cred.h Wed Aug 28 23:22:09 2002 @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_CRED_H__ +#define __XFS_CRED_H__ + +/* + * Credentials + */ +typedef struct cred { + /* EMPTY */ +} cred_t; + +extern struct cred *sys_cred; + +/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ +static __inline int capable_cred(cred_t *cr, int cid) +{ + return (cr == sys_cred) ? 1 : capable(cid); +} + +#endif /* __XFS_CRED_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_file.c linux-2.4-xfs/fs/xfs/linux/xfs_file.c --- linux-2.4.19/fs/xfs/linux/xfs_file.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_file.c Wed Jul 31 11:49:51 2002 @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include +#include +#include /* for PROT_WRITE */ + +static struct vm_operations_struct linvfs_file_vm_ops; + + +STATIC ssize_t +linvfs_read( + struct file *filp, + char *buf, + size_t size, + loff_t *offset) +{ + vnode_t *vp; + int error; + + vp = LINVFS_GET_VP(filp->f_dentry->d_inode); + ASSERT(vp); + + VOP_READ(vp, filp, buf, size, offset, NULL, error); + + return(error); +} + + +STATIC ssize_t +linvfs_write( + struct file *file, + const char *buf, + size_t count, + loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode; + loff_t pos; + vnode_t *vp; + int err; /* Use negative errors in this f'n */ + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + pos = *ppos; + err = -EINVAL; + if (pos < 0) + goto out; + + err = file->f_error; + if (err) { + file->f_error = 0; + goto out; + } + + vp = LINVFS_GET_VP(inode); + ASSERT(vp); + + /* We allow multiple direct writers in, there is no + * potential call to vmtruncate in that path. 
+ */ + if (!(file->f_flags & O_DIRECT)) + down(&inode->i_sem); + + VOP_WRITE(vp, file, buf, count, &pos, NULL, err); + *ppos = pos; + + if (!(file->f_flags & O_DIRECT)) + up(&inode->i_sem); +out: + + return(err); +} + + +STATIC int +linvfs_open( + struct inode *inode, + struct file *filp) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + if (!(filp->f_flags & O_LARGEFILE) && inode->i_size > MAX_NON_LFS) + return -EFBIG; + + ASSERT(vp); + VOP_OPEN(vp, NULL, error); + return -error; +} + + +STATIC int +linvfs_release( + struct inode *inode, + struct file *filp) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error = 0; + + if (vp) + VOP_RELEASE(vp, error); + return -error; +} + + +STATIC int +linvfs_fsync( + struct file *filp, + struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + int flags = FSYNC_WAIT; + + if (datasync) + flags |= FSYNC_DATA; + + ASSERT(vp); + + VOP_FSYNC(vp, flags, NULL, (off_t)0, (off_t)-1, error); + + return -error; +} + +/* + * linvfs_readdir maps to VOP_READDIR(). + * We need to build a uio, cred, ... 
+ */ + +#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen)) + +STATIC int +linvfs_readdir( + struct file *filp, + void *dirent, + filldir_t filldir) +{ + int error = 0; + vnode_t *vp; + uio_t uio; + iovec_t iov; + int eof = 0; + caddr_t read_buf; + int namelen, size = 0; + size_t rlen = PAGE_CACHE_SIZE << 2; + off_t start_offset; + xfs_dirent_t *dbp = NULL; + + vp = LINVFS_GET_VP(filp->f_dentry->d_inode); + ASSERT(vp); + + /* Try fairly hard to get memory */ + do { + if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) + break; + rlen >>= 1; + } while (rlen >= 1024); + + if (read_buf == NULL) + return -ENOMEM; + + uio.uio_iov = &iov; + uio.uio_fmode = filp->f_mode; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_offset = filp->f_pos; + + while (!eof) { + uio.uio_resid = iov.iov_len = rlen; + iov.iov_base = read_buf; + uio.uio_iovcnt = 1; + + start_offset = uio.uio_offset; + + VOP_READDIR(vp, &uio, NULL, &eof, error); + if ((uio.uio_offset == start_offset) || error) { + size = 0; + break; + } + + size = rlen - uio.uio_resid; + dbp = (xfs_dirent_t *)read_buf; + while (size > 0) { + namelen = strlen(dbp->d_name); + + if (filldir(dirent, dbp->d_name, namelen, + (loff_t) dbp->d_off, + (ino_t) dbp->d_ino, + DT_UNKNOWN)) { + goto done; + } + size -= dbp->d_reclen; + dbp = nextdp(dbp); + } + } +done: + if (!error) { + if (size == 0) + filp->f_pos = uio.uio_offset; + else if (dbp) + filp->f_pos = dbp->d_off; + } + + kfree(read_buf); + return -error; +} + +STATIC int +linvfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + struct inode *ip = filp->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(ip); + vattr_t va = { .va_mask = AT_UPDATIME }; + int error; + + if ((vp->v_type == VREG) && (vp->v_vfsp->vfs_flag & VFS_DMI)) { + error = -xfs_dm_send_mmap_event(vma, 0); + if (error) + return error; + } + + vma->vm_ops = &linvfs_file_vm_ops; + + VOP_SETATTR(vp, &va, AT_UPDATIME, NULL, error); + UPDATE_ATIME(ip); + return 0; +} + + +STATIC int 
+linvfs_ioctl( + struct inode *inode, + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + int error; + vnode_t *vp = LINVFS_GET_VP(inode); + + ASSERT(vp); + VOP_IOCTL(vp, inode, filp, cmd, arg, error); + VMODIFY(vp); + + /* NOTE: some of the ioctl's return positive #'s as a + * byte count indicating success, such as + * readlink_by_handle. So we don't "sign flip" + * like most other routines. This means true + * errors need to be returned as a negative value. + */ + return error; +} + +#ifdef HAVE_VMOP_MPROTECT +STATIC int +linvfs_mprotect( + struct vm_area_struct *vma, + unsigned int newflags) +{ + vnode_t *vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode); + int error = 0; + + if ((vp->v_type == VREG) && (vp->v_vfsp->vfs_flag & VFS_DMI)) { + if ((vma->vm_flags & VM_MAYSHARE) && + (newflags & PROT_WRITE) && !(vma->vm_flags & PROT_WRITE)){ + error = xfs_dm_send_mmap_event(vma, VM_WRITE); + } + } + return error; +} +#endif /* HAVE_VMOP_MPROTECT */ + + +struct file_operations linvfs_file_operations = { + .llseek = generic_file_llseek, + .read = linvfs_read, + .write = linvfs_write, + .ioctl = linvfs_ioctl, + .mmap = linvfs_file_mmap, + .open = linvfs_open, + .release = linvfs_release, + .fsync = linvfs_fsync, +}; + +struct file_operations linvfs_dir_operations = { + .read = generic_read_dir, + .readdir = linvfs_readdir, + .ioctl = linvfs_ioctl, + .fsync = linvfs_fsync, +}; + +static struct vm_operations_struct linvfs_file_vm_ops = { + .nopage = filemap_nopage, +#ifdef HAVE_VMOP_MPROTECT + .mprotect = linvfs_mprotect, +#endif +}; diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_fs_subr.c linux-2.4-xfs/fs/xfs/linux/xfs_fs_subr.c --- linux-2.4.19/fs/xfs/linux/xfs_fs_subr.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_fs_subr.c Sat Aug 24 17:08:32 2002 @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include <xfs.h> + +/* + * Implementation for VFS_DOUNMOUNT. + */ +int +fs_dounmount( + bhv_desc_t *bdp, + int flags, + vnode_t *rootvp, + cred_t *cr) +{ + struct vfs *vfsp = bhvtovfs(bdp); + bhv_desc_t *fbdp = vfsp->vfs_fbhv; + int error; + + /* + * Wait for sync to finish and lock vfsp. This also sets the + * VFS_OFFLINE flag. Once we do this we can give up the reference + * on the root vnode which we hold to avoid another unmount + * ripping the vfs out from under us before we get to lock it. + * The VFS_DOUNMOUNT calling convention is that the reference + * on the root vnode is released whether the call succeeds or + * fails. 
+ */ + if (rootvp) + VN_RELE(rootvp); + + /* + * Now invoke SYNC and UNMOUNT ops, using the PVFS versions is + * OK since we already have a behavior lock as a result of + * being in VFS_DOUNMOUNT. It is necessary to do things this + * way since using the VFS versions would cause us to get the + * behavior lock twice which can cause deadlock as well as + * making the coding of vfs relocation unnecessarilty difficult + * by making relocations invoked by unmount occur in a different + * environment than those invoked by mount-update. + */ + PVFS_SYNC(fbdp, SYNC_ATTR|SYNC_DELWRI, cr, error); + if (error == 0) + PVFS_UNMOUNT(fbdp, flags, cr, error); + return error; +} + +/* + * Stub for no-op vnode operations that return error status. + */ +int +fs_noerr() +{ + return 0; +} + +/* + * Operation unsupported under this file system. + */ +int +fs_nosys() +{ + return ENOSYS; +} + +/* + * Stub for inactive, strategy, and read/write lock/unlock. Does nothing. + */ +/* ARGSUSED */ +void +fs_noval() +{ +} + +/* + * vnode pcache layer for vnode_tosspages. + * 'last' parameter unused but left in for IRIX compatibility + */ +void +fs_tosspages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) + truncate_inode_pages(ip->i_mapping, first); +} + + +/* + * vnode pcache layer for vnode_flushinval_pages. + * 'last' parameter unused but left in for IRIX compatibility + */ +void +fs_flushinval_pages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) { + filemap_fdatasync(ip->i_mapping); + fsync_inode_data_buffers(ip); + filemap_fdatawait(ip->i_mapping); + + truncate_inode_pages(ip->i_mapping, first); + } +} + +/* + * vnode pcache layer for vnode_flush_pages. 
+ * 'last' parameter unused but left in for IRIX compatibility
+ *
+ * When VN_CACHED(vp) is true, calls filemap_fdatasync(),
+ * fsync_inode_data_buffers() and filemap_fdatawait() on the inode, in
+ * that order.  'first', 'flags' and 'fiopt' are not used either.
+ * Always returns 0; the return values of the three calls are ignored.
+ */
+int
+fs_flush_pages(
+	bhv_desc_t	*bdp,
+	xfs_off_t	first,
+	xfs_off_t	last,
+	uint64_t	flags,
+	int		fiopt)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct inode	*ip = LINVFS_GET_IP(vp);
+
+	if (VN_CACHED(vp)) {
+		filemap_fdatasync(ip->i_mapping);
+		fsync_inode_data_buffers(ip);
+		filemap_fdatawait(ip->i_mapping);
+	}
+
+	return 0;
+}
+
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_fs_subr.h linux-2.4-xfs/fs/xfs/linux/xfs_fs_subr.h
--- linux-2.4.19/fs/xfs/linux/xfs_fs_subr.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_fs_subr.h Wed Aug 28 23:20:07 2002
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUBR_H__
+#define __XFS_SUBR_H__
+
+/*
+ * Utilities shared among file system implementations.
+ *
+ * fs_noerr, fs_nosys and fs_noval are argument-less stubs; the page cache
+ * helpers take 'last' and 'fiopt' arguments that their definitions in
+ * xfs_fs_subr.c do not use.
+ */
+
+struct cred;
+
+extern int	fs_noerr(void);		/* stub: always returns 0 */
+extern int	fs_nosys(void);		/* stub: always returns ENOSYS */
+extern int	fs_nodev(void);		/* NOTE(review): no definition in the visible part of xfs_fs_subr.c */
+extern void	fs_noval(void);		/* stub: does nothing */
+extern int	fs_dounmount(bhv_desc_t *, int, vnode_t *, struct cred *);
+extern void	fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern void	fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern int	fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
+
+#endif	/* __XFS_SUBR_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_globals.c linux-2.4-xfs/fs/xfs/linux/xfs_globals.c
--- linux-2.4.19/fs/xfs/linux/xfs_globals.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_globals.c Wed Aug 28 23:20:07 2002
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains globals needed by XFS that were normally defined
+ * somewhere else in IRIX.
+ */
+
+#include	/* NOTE(review): header name lost in extraction; presumably <xfs.h> - TODO confirm */
+
+uint64_t	xfs_panic_mask;		/* set to cause more panics */
+unsigned long	xfs_physmem;		/* not assigned in this file; presumably physical memory size - TODO confirm */
+
+/*
+ * restricted_chown = 1 bsd style chown(2), only super-user can give away files
+ * restricted_chown = 0 sysV style chown(2), non super-user can give away files
+ *
+ * Defaults to the BSD behaviour.
+ */
+int		restricted_chown = 1;
+
+/*
+ * Used to serialize atomicIncWithWrap.
+ */
+spinlock_t	Atomic_spin = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Global system credential structure.
+ * sys_cred always points at the statically allocated sys_cred_val.
+ */
+cred_t sys_cred_val, *sys_cred = &sys_cred_val;
+
+/*
+ * The global quota manager. There is only one of these for the entire
+ * system, _not_ one per file system. XQM keeps track of the overall
+ * quota functionality, including maintaining the freelist and hash
+ * tables of dquots.
+ *
+ * xfs_Gqm starts out NULL (static storage); xfs_Gqm_lock is not
+ * initialised in this file.
+ */
+struct xfs_qm	*xfs_Gqm;
+mutex_t		xfs_Gqm_lock;
+
+/* Export XFS symbols used by xfsidbg (the last two are not defined in this file) */
+EXPORT_SYMBOL(xfs_Gqm);
+EXPORT_SYMBOL(xfs_next_bit);
+EXPORT_SYMBOL(xfs_contig_bits);
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_globals.h linux-2.4-xfs/fs/xfs/linux/xfs_globals.h
--- linux-2.4.19/fs/xfs/linux/xfs_globals.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_globals.h Wed Aug 28 23:20:07 2002
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_GLOBALS_H__
+#define __XFS_GLOBALS_H__
+
+/*
+ * This file declares globals needed by XFS that were normally defined
+ * somewhere else in IRIX.
+ *
+ * The definitions live in xfs_globals.c.
+ */
+
+extern uint64_t	xfs_panic_mask;		/* set to cause more panics */
+
+extern int	restricted_chown;	/* 1 = BSD style chown(2), 0 = SysV style */
+extern unsigned long xfs_physmem;
+
+extern struct cred *sys_cred;		/* global system credential */
+
+extern struct xfs_qm *xfs_Gqm;		/* the single, system-wide quota manager */
+extern mutex_t	xfs_Gqm_lock;
+
+#endif	/* __XFS_GLOBALS_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_ioctl.c linux-2.4-xfs/fs/xfs/linux/xfs_ioctl.c
--- linux-2.4.19/fs/xfs/linux/xfs_ioctl.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_ioctl.c Thu Sep 5 15:35:08 2002
@@ -0,0 +1,1094 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include	/* NOTE(review): the header names of these five #include lines were lost in extraction - restore from the original patch */
+#include
+#include
+#include
+#include
+
+
+extern int xfs_change_file_space(bhv_desc_t *, int,
+			xfs_flock64_t *, xfs_off_t, cred_t *, int);
+extern int xfs_set_dmattrs(bhv_desc_t *, u_int, u_int16_t, cred_t *);
+
+
+/*
+ * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ *    returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ *    returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ *    returns full handle for a path
+ *
+ * 'arg' is the user address of the request.  The handle is written to
+ * hreq.ohandle and its size to hreq.ohandlen.  Returns 0 on success or a
+ * negative errno: -EFAULT on a bad user pointer, -EBADF for a bad fd or
+ * for an inode that is not a regular file, directory or symlink, -EINVAL
+ * for an unknown cmd or an inode that is not on XFS, or whatever the path
+ * lookup reported.
+ */
+STATIC int
+xfs_find_handle(
+	unsigned int	cmd,
+	unsigned long	arg)
+{
+	int			hsize;
+	xfs_handle_t		handle;
+	xfs_fsop_handlereq_t	hreq;
+	struct inode		*inode;
+	struct vnode		*vp;
+
+	if (copy_from_user(&hreq, (xfs_fsop_handlereq_t *)arg, sizeof(hreq)))
+		return -XFS_ERROR(EFAULT);
+
+	bzero((char *)&handle, sizeof(handle));
+
+	switch (cmd) {
+	case XFS_IOC_PATH_TO_FSHANDLE:
+	case XFS_IOC_PATH_TO_HANDLE: {
+		struct nameidata	nd;
+		char			*path;
+		int			error;
+
+		/* we need the path */
+		path = getname(hreq.path);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+
+		/* traverse the path */
+		error = 0;
+		if (path_init(path, LOOKUP_POSITIVE, &nd))
+			error = path_walk(path, &nd);
+		putname(path);
+		if (error)
+			return error;
+
+		ASSERT(nd.dentry);
+		ASSERT(nd.dentry->d_inode);
+		/* NOTE(review): igrab() result is used below without a NULL check - confirm it cannot fail here */
+		inode = igrab(nd.dentry->d_inode);
+		path_release(&nd);
+		break;
+	}
+
+	case XFS_IOC_FD_TO_HANDLE: {
+		struct file	*file;
+
+		file = fget(hreq.fd);
+		if (!file)
+		    return -EBADF;
+
+		ASSERT(file->f_dentry);
+		ASSERT(file->f_dentry->d_inode);
+		inode = igrab(file->f_dentry->d_inode);
+		fput(file);
+
+		break;
+	}
+
+	default:
+		ASSERT(0);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	/* from here on every exit path must iput() the inode grabbed above */
+	if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
+		/* we're not in XFS anymore, Toto */
+		iput(inode);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	/* we need the vnode */
+	vp = LINVFS_GET_VP(inode);
+	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
+		iput(inode);
+		return -XFS_ERROR(EBADF);
+	}
+
+	/* now we can grab the fsid */
+	memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
+	hsize = sizeof(xfs_fsid_t);	/* an fs handle is just the fsid */
+
+	if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
+		xfs_inode_t	*ip;
+		bhv_desc_t	*bhv;
+		int		lock_mode;
+
+		/* need to get access to the xfs_inode to read the generation */
+		VN_BHV_READ_LOCK(&(vp)->v_bh);
+		bhv = VNODE_TO_FIRST_BHV(vp);
+		ASSERT(bhv);
+		ip = XFS_BHVTOI(bhv);
+		ASSERT(ip);
+		lock_mode = xfs_ilock_map_shared(ip);
+
+		/* fill in fid section of handle from inode */
+		handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) -
+					    sizeof(handle.ha_fid.xfs_fid_len);
+		handle.ha_fid.xfs_fid_pad = 0;
+		handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen;
+		handle.ha_fid.xfs_fid_ino = ip->i_ino;
+
+		xfs_iunlock_map_shared(ip, lock_mode);
+		VN_BHV_READ_UNLOCK(&(vp)->v_bh);
+
+		hsize = XFS_HSIZE(handle);
+	}
+
+	/* now copy our handle into the user buffer & write out the size */
+	if (copy_to_user((xfs_handle_t *)hreq.ohandle, &handle, hsize) ||
+	    copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
+		iput(inode);
+		return -XFS_ERROR(EFAULT);
+	}
+
+	iput(inode);
+	return 0;
+}
+
+
+/*
+ * Convert userspace handle data into vnode (and inode).
+ * We [ab]use the fact that all the fsop_handlereq ioctl calls
+ * have a data structure argument whose first component is always
+ * a xfs_fsop_handlereq_t, so we can cast to and from this type.
+ * This allows us to optimise the copy_from_user calls and gives
+ * a handy, shared routine.
+ *
+ * If no error, caller must always VN_RELE the returned vp.
+ */ +STATIC int +xfs_vget_fsop_handlereq( + xfs_mount_t *mp, + struct inode *parinode, /* parent inode pointer */ + int cap, /* capability level for op */ + unsigned long arg, /* userspace data pointer */ + unsigned long size, /* size of expected struct */ + /* output arguments */ + xfs_fsop_handlereq_t *hreq, + vnode_t **vp, + struct inode **inode) +{ + void *hanp; + size_t hlen; + xfs_fid_t *xfid; + xfs_handle_t *handlep; + xfs_handle_t handle; + xfs_inode_t *ip; + struct inode *inodep; + vnode_t *vpp; + __u32 igen; + ino_t ino; + int error; + + if (!capable(cap)) + return XFS_ERROR(EPERM); + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(parinode->i_mode)) + return XFS_ERROR(ENOTDIR); + + /* + * Copy the handle down from the user and validate + * that it looks to be in the correct format. + */ + if (copy_from_user(hreq, (struct xfs_fsop_handlereq *)arg, size)) + return XFS_ERROR(EFAULT); + + hanp = hreq->ihandle; + hlen = hreq->ihandlen; + handlep = &handle; + + if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) + return XFS_ERROR(EINVAL); + if (copy_from_user(handlep, hanp, hlen)) + return XFS_ERROR(EFAULT); + if (hlen < sizeof(*handlep)) + bzero(((char *)handlep) + hlen, sizeof(*handlep) - hlen); + if (hlen > sizeof(handlep->ha_fsid)) { + if (handlep->ha_fid.xfs_fid_len != + (hlen - sizeof(handlep->ha_fsid) + - sizeof(handlep->ha_fid.xfs_fid_len)) + || handlep->ha_fid.xfs_fid_pad) + return XFS_ERROR(EINVAL); + } + + /* + * Crack the handle, obtain the inode # & generation # + */ + xfid = (struct xfs_fid *)&handlep->ha_fid; + if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) { + ino = xfid->xfs_fid_ino; + igen = xfid->xfs_fid_gen; + } else { + return XFS_ERROR(EINVAL); + } + + /* + * Get the XFS inode, building a vnode to go with it. 
+ */ + error = xfs_iget(mp, NULL, ino, XFS_ILOCK_SHARED, &ip, 0); + if (error) + return error; + if (ip == NULL) + return XFS_ERROR(EIO); + if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { + xfs_iput_new(ip, XFS_ILOCK_SHARED); + return XFS_ERROR(ENOENT); + } + + vpp = XFS_ITOV(ip); + inodep = LINVFS_GET_IP(vpp); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = linvfs_revalidate_core(inodep, ATTR_COMM); + if (error) { + iput(inodep); + /* this error is (-) but our callers expect + */ + return XFS_ERROR(-error); + } + + *vp = vpp; + *inode = inodep; + return 0; +} + +STATIC int +xfs_open_by_handle( + xfs_mount_t *mp, + unsigned long arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + int new_fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + vnode_t *vp; + xfs_fsop_handlereq_t hreq; + struct list_head *lp; + + error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg, + sizeof(xfs_fsop_handlereq_t), + &hreq, &vp, &inode); + if (error) + return -error; + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + iput(inode); + return -XFS_ERROR(EINVAL); + } + +#if BITS_PER_LONG != 32 + hreq.oflags |= O_LARGEFILE; +#endif + /* Put open permission in namei format. */ + permflag = hreq.oflags; + if ((permflag+1) & O_ACCMODE) + permflag++; + if (permflag & O_TRUNC) + permflag |= 2; + + /* Can't write directories. */ + if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + iput(inode); + return -XFS_ERROR(EISDIR); + } + + if ((new_fd = get_unused_fd()) < 0) { + iput(inode); + return new_fd; + } + + /* Now to find a dentry. If possible, get a well-connected one. */ + spin_lock(&dcache_lock); + for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) { + dentry = list_entry(lp, struct dentry, d_alias); + if (! 
(dentry->d_flags & DCACHE_NFSD_DISCONNECTED)) { + dget_locked(dentry); + dentry->d_vfs_flags |= DCACHE_REFERENCED; + spin_unlock(&dcache_lock); + iput(inode); + goto found; + } + } + spin_unlock(&dcache_lock); + + /* ELSE didn't find dentry. Create anonymous dcache entry. */ + dentry = d_alloc_root(inode); + if (dentry == NULL) { + iput(inode); + put_unused_fd(new_fd); + return -XFS_ERROR(ENOMEM); + } + + /* Keep nfsd happy. */ + dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; + + found: + /* Ensure umount returns EBUSY on umounts while this file is open. */ + mntget(parfilp->f_vfsmnt); + + /* Create file pointer. */ + filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags); + if (IS_ERR(filp)) { + put_unused_fd(new_fd); + return -XFS_ERROR(-PTR_ERR(filp)); + } + filp->f_mode |= FINVIS; + + fd_install(new_fd, filp); + return new_fd; +} + +STATIC int +xfs_readlink_by_handle( + xfs_mount_t *mp, + unsigned long arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + struct iovec aiov; + struct uio auio; + struct inode *inode; + xfs_fsop_handlereq_t hreq; + vnode_t *vp; + __u32 olen; + + error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg, + sizeof(xfs_fsop_handlereq_t), + &hreq, &vp, &inode); + if (error) + return -error; + + /* Restrict this handle operation to symlinks only. 
*/ + if (vp->v_type != VLNK) { + VN_RELE(vp); + return -XFS_ERROR(EINVAL); + } + + if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { + VN_RELE(vp); + return -XFS_ERROR(EFAULT); + } + aiov.iov_len = olen; + aiov.iov_base = hreq.ohandle; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_fmode = FINVIS; + auio.uio_offset = 0; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_resid = olen; + + VOP_READLINK(vp, &auio, NULL, error); + + VN_RELE(vp); + return (olen - auio.uio_resid); +} + +STATIC int +xfs_fssetdm_by_handle( + xfs_mount_t *mp, + unsigned long arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct inode *inode; + bhv_desc_t *bdp; + vnode_t *vp; + + error = xfs_vget_fsop_handlereq(mp, parinode, CAP_MKNOD, arg, + sizeof(xfs_fsop_setdm_handlereq_t), + (xfs_fsop_handlereq_t *)&dmhreq, + &vp, &inode); + if (error) + return -error; + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + VN_RELE(vp); + return -XFS_ERROR(EFAULT); + } + + bdp = bhv_base_unlocked(VN_BHV_HEAD(vp)); + error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL); + + VN_RELE(vp); + if (error) + return -error; + return 0; +} + +STATIC int +xfs_attrlist_by_handle( + xfs_mount_t *mp, + unsigned long arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct inode *inode; + vnode_t *vp; + + error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg, + sizeof(xfs_fsop_attrlist_handlereq_t), + (xfs_fsop_handlereq_t *)&al_hreq, + &vp, &inode); + if (error) + return -error; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + VOP_ATTR_LIST(vp, al_hreq.buffer, al_hreq.buflen, al_hreq.flags, + cursor, NULL, error); + VN_RELE(vp); + if (error) + return -error; + return 0; +} + +STATIC int +xfs_attrmulti_by_handle( + xfs_mount_t *mp, + unsigned long arg, + struct file *parfilp, + 
struct inode *parinode) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct inode *inode; + vnode_t *vp; + int i, size; + + error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg, + sizeof(xfs_fsop_attrmulti_handlereq_t), + (xfs_fsop_handlereq_t *)&am_hreq, + &vp, &inode); + if (error) + return -error; + + size = am_hreq.opcount * sizeof(attr_multiop_t); + ops = (xfs_attr_multiop_t *)kmalloc(size, GFP_KERNEL); + if (!ops) { + VN_RELE(vp); + return -XFS_ERROR(ENOMEM); + } + + if (copy_from_user(ops, am_hreq.ops, size)) { + kfree(ops); + VN_RELE(vp); + return -XFS_ERROR(EFAULT); + } + + for (i = 0; i < am_hreq.opcount; i++) { + switch(ops[i].am_opcode) { + case ATTR_OP_GET: + VOP_ATTR_GET(vp,ops[i].am_attrname, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags, + NULL, ops[i].am_error); + break; + case ATTR_OP_SET: + VOP_ATTR_SET(vp,ops[i].am_attrname, ops[i].am_attrvalue, + ops[i].am_length, ops[i].am_flags, + NULL, ops[i].am_error); + break; + case ATTR_OP_REMOVE: + VOP_ATTR_REMOVE(vp, ops[i].am_attrname, ops[i].am_flags, + NULL, ops[i].am_error); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = -XFS_ERROR(EFAULT); + + kfree(ops); + VN_RELE(vp); + return error; +} + +/* prototypes for a few of the stack-hungry cases that have + * their own functions. 
Functions are defined after their use + * so gcc doesn't get fancy and inline them with -03 */ + +int xfs_ioc_space( + bhv_desc_t *bdp, + vnode_t *vp, + struct file *filp, + unsigned int cmd, + unsigned long arg); + +int xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + unsigned long arg); + +int xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + unsigned long arg); + +int xfs_ioc_fsgeometry( + xfs_mount_t *mp, + unsigned long arg); + +int xfs_ioc_xattr( + vnode_t *vp, + struct file *filp, + unsigned int cmd, + unsigned long arg); + +int xfs_ioc_getbmap( + bhv_desc_t *bdp, + struct file *filp, + unsigned int cmd, + unsigned long arg); + +int xfs_ioc_getbmapx( + bhv_desc_t *bdp, + unsigned long arg); + +int +xfs_ioctl( + bhv_desc_t *bdp, + struct inode *inode, + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + int error; + vnode_t *vp; + xfs_inode_t *ip; + xfs_mount_t *mp; + + vp = LINVFS_GET_VP(inode); + + vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + switch (cmd) { + + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + return xfs_ioc_space(bdp, vp, filp, cmd, arg); + + case XFS_IOC_DIOINFO: { + struct dioattr da; + + da.d_miniosz = mp->m_sb.sb_blocksize; + da.d_mem = mp->m_sb.sb_blocksize; + + /* + * this only really needs to be BBSIZE. + * it is set to the file system block size to + * avoid having to do block zeroing on short writes. 
+ */ + da.d_maxiosz = XFS_FSB_TO_B(mp, + XFS_B_TO_FSBT(mp, KIO_MAX_ATOMIC_IO << 10)); + + if (copy_to_user((struct dioattr *)arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_xattr(vp, filp, cmd, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, (struct fsdmidata *)arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate, + NULL); + if (error) + return -error; + return 0; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(bdp, filp, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(bdp, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: + return xfs_find_handle(cmd, arg); + + case XFS_IOC_OPEN_BY_HANDLE: + return xfs_open_by_handle(mp, arg, filp, inode); + + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(mp, arg, filp, inode); + + case XFS_IOC_READLINK_BY_HANDLE: + return xfs_readlink_by_handle(mp, arg, filp, inode); + + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(mp, arg, filp, inode); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(mp, arg, filp, inode); + + case XFS_IOC_SWAPEXT: { + error = xfs_swapext((struct xfs_swapext *)arg); + if (error) + return -error; + return 0; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return -error; + + if (copy_to_user((char *)arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + 
xfs_fsop_resblks_t inout; + __uint64_t in; + + /* Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&inout, (char *)arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + + if (copy_to_user((char *)arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user((char *)arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, (char *)arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_data(mp, &in); + if (error) + return -error; + return 0; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, (char *)arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_log(mp, &in); + if (error) + return -error; + return 0; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, (char *)arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_rt(mp, &in); + if (error) + return -error; + return 0; + } + + case XFS_IOC_FREEZE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + xfs_fs_freeze(mp); + return 0; + + case XFS_IOC_THAW: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + xfs_fs_thaw(mp); + return 0; + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (copy_from_user(&in, (char *)arg, sizeof(in))) + return 
-XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + if (error) + return -error; + return 0; + } + + case XFS_IOC_ERROR_CLEARALL: + error = xfs_errortag_clearall(mp); + return -error; + + default: + return -ENOTTY; + } +} + +int xfs_ioc_space( + bhv_desc_t *bdp, + vnode_t *vp, + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + xfs_flock64_t bf; + int attr_flags = 0; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + if (filp->f_flags & O_RDONLY) + return -XFS_ERROR(EBADF); + + if (vp->v_type != VREG) + return -XFS_ERROR(EINVAL); + + if (copy_from_user(&bf, (xfs_flock64_t *)arg, sizeof(bf))) + return -XFS_ERROR(EFAULT); + + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + if (filp->f_mode & FINVIS) + attr_flags |= ATTR_DMI; + + error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos, + NULL, attr_flags); + return -error; +} + +int xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + unsigned long arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + /* Do not allow space reservation if this is not the admin and + * unwritten extents are turned off. 
+ */ + if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (copy_from_user(&bulkreq, (xfs_fsop_bulkreq_t *)arg, + sizeof(xfs_fsop_bulkreq_t))) + return -XFS_ERROR(EFAULT); + + if (copy_from_user(&inlast, (__s64 *)bulkreq.lastip, + sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, NULL, &inlast, &count, + bulkreq.ubuffer); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + else { /* XFS_IOC_FSBULKSTAT */ + if (count == 1 && inlast != 0) { + inlast++; + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + } else { + error = xfs_bulkstat(mp, NULL, &inlast, &count, + (bulkstat_one_pf)xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + BULKSTAT_FG_QUICK, &done); + } + } + + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user((xfs_ino_t *)bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user((__s32 *)bulkreq.ocount, &count, + sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +int xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + unsigned long arg) +{ + xfs_fsop_geom_v1_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); + if (error) + return -error; + + if (copy_to_user((xfs_fsop_geom_t *)arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +int xfs_ioc_fsgeometry( + xfs_mount_t *mp, + unsigned long arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return -error; + + if (copy_to_user((xfs_fsop_geom_t *)arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +int xfs_ioc_xattr( + vnode_t *vp, + struct file *filp, + unsigned int cmd, + unsigned long 
arg) +{ + struct fsxattr fa; + vattr_t va; + int error; + + switch (cmd) { + case XFS_IOC_FSGETXATTR: { + va.va_mask = AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return -error; + + fa.fsx_xflags = va.va_xflags; + fa.fsx_extsize = va.va_extsize; + fa.fsx_nextents = va.va_nextents; + + if (copy_to_user((struct fsxattr *)arg, &fa, sizeof(fa))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSSETXATTR: { + int attr_flags = 0; + + if (copy_from_user(&fa, (struct fsxattr *)arg, sizeof(fa))) + return -XFS_ERROR(EFAULT); + + va.va_mask = AT_XFLAGS | AT_EXTSIZE; + va.va_xflags = fa.fsx_xflags; + va.va_extsize = fa.fsx_extsize; + + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + + VOP_SETATTR(vp, &va, attr_flags, NULL, error); + return -error; + } + + case XFS_IOC_FSGETXATTRA: { + + va.va_mask = AT_XFLAGS|AT_EXTSIZE|AT_ANEXTENTS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return -error; + + fa.fsx_xflags = va.va_xflags; + fa.fsx_extsize = va.va_extsize; + fa.fsx_nextents = va.va_anextents; + + if (copy_to_user((struct fsxattr *)arg, &fa, sizeof(fa))) + return -XFS_ERROR(EFAULT); + return 0; + } + + default: + return -ENOTTY; + + } +} + +int xfs_ioc_getbmap( + bhv_desc_t *bdp, + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + struct getbmap bm; + int iflags; + int error; + + if (copy_from_user(&bm, (struct getbmap *)arg, sizeof(bm))) + return -XFS_ERROR(EFAULT); + + if (bm.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + iflags = (cmd == XFS_IOC_GETBMAPA ? 
BMV_IF_ATTRFORK : 0); + if (filp->f_mode & FINVIS) + iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(bdp, &bm, (struct getbmap *)arg+1, iflags); + if (error) + return -error; + + if (copy_to_user((struct getbmap *)arg, &bm, sizeof(bm))) + return -XFS_ERROR(EFAULT); + return 0; +} + +int xfs_ioc_getbmapx( + bhv_desc_t *bdp, + unsigned long arg) +{ + struct getbmapx bmx; + struct getbmap bm; + int iflags; + int error; + + if (copy_from_user(&bmx, (struct getbmapx *)arg, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + /* + * Map input getbmapx structure to a getbmap + * structure for xfs_getbmap. + */ + GETBMAP_CONVERT(bmx, bm); + + iflags = bmx.bmv_iflags; + + if (iflags & (~BMV_IF_VALID)) + return -XFS_ERROR(EINVAL); + + iflags |= BMV_IF_EXTENDED; + + error = xfs_getbmap(bdp, &bm, (struct getbmapx *)arg+1, iflags); + if (error) + return -error; + + GETBMAP_CONVERT(bm, bmx); + + if (copy_to_user((struct getbmapx *)arg, &bmx, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + return 0; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_iops.c linux-2.4-xfs/fs/xfs/linux/xfs_iops.c --- linux-2.4.19/fs/xfs/linux/xfs_iops.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_iops.c Thu Sep 5 15:35:08 2002 @@ -0,0 +1,871 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. 
Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include + + +/* + * Pull the link count and size up from the xfs inode to the linux inode + */ +STATIC void +validate_fields( + struct inode *ip) +{ + vnode_t *vp = LINVFS_GET_VP(ip); + vattr_t va; + int error; + + va.va_mask = AT_NLINK|AT_SIZE; + VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error); + ip->i_nlink = va.va_nlink; + ip->i_size = va.va_size; + ip->i_blocks = va.va_nblocks; +} + +#ifdef CONFIG_FS_POSIX_ACL +/* + * Determine whether a process has a valid fs_struct (kernel daemons + * like knfsd don't have an fs_struct). + */ +STATIC int inline +has_fs_struct(struct task_struct *task) +{ + return (task->fs != init_task.fs); +} +#endif + +STATIC int +linvfs_mknod( + struct inode *dir, + struct dentry *dentry, + int mode, + int rdev) +{ + struct inode *ip; + vattr_t va; + vnode_t *vp = NULL, *dvp = LINVFS_GET_VP(dir); + xattr_exists_t test_default_acl = _ACL_DEFAULT_EXISTS; + int have_default_acl = 0; + int error = EINVAL; + + if (test_default_acl) + have_default_acl = test_default_acl(dvp); + +#ifdef CONFIG_FS_POSIX_ACL + /* + * Conditionally compiled so that the ACL base kernel changes can be + * split out into separate patches - remove this once MS_POSIXACL is + * accepted, or some other way to implement this exists. 
+ */ + if (IS_POSIXACL(dir) && !have_default_acl && has_fs_struct(current)) + mode &= ~current->fs->umask; +#endif + + bzero(&va, sizeof(va)); + va.va_mask = AT_TYPE|AT_MODE; + va.va_type = IFTOVT(mode); + va.va_mode = mode; + + switch (mode & S_IFMT) { + case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + va.va_rdev = rdev; + va.va_mask |= AT_RDEV; + /*FALLTHROUGH*/ + case S_IFREG: + VOP_CREATE(dvp, dentry, &va, &vp, NULL, error); + break; + case S_IFDIR: + VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error); + break; + default: + error = EINVAL; + break; + } + + if (!error) { + ASSERT(vp); + ip = LINVFS_GET_IP(vp); + if (!ip) { + VN_RELE(vp); + return -ENOMEM; + } + + if (S_ISCHR(mode) || S_ISBLK(mode)) + ip->i_rdev = to_kdev_t(rdev); + /* linvfs_revalidate_core returns (-) errors */ + error = -linvfs_revalidate_core(ip, ATTR_COMM); + validate_fields(dir); + d_instantiate(dentry, ip); + mark_inode_dirty_sync(ip); + mark_inode_dirty_sync(dir); + } + + if (!error && have_default_acl) { + _ACL_DECL (pdacl); + + if (!_ACL_ALLOC(pdacl)) { + error = -ENOMEM; + } else { + if (_ACL_GET_DEFAULT(dvp, pdacl)) + error = _ACL_INHERIT(vp, &va, pdacl); + VMODIFY(vp); + _ACL_FREE(pdacl); + } + } + return -error; +} + +STATIC int +linvfs_create( + struct inode *dir, + struct dentry *dentry, + int mode) +{ + return linvfs_mknod(dir, dentry, mode, 0); +} + +STATIC int +linvfs_mkdir( + struct inode *dir, + struct dentry *dentry, + int mode) +{ + return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0); +} + + +STATIC struct dentry * +linvfs_lookup( + struct inode *dir, + struct dentry *dentry) +{ + int error; + vnode_t *vp, *cvp; + struct inode *ip = NULL; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + cvp = NULL; + vp = LINVFS_GET_VP(dir); + VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error); + if (!error) { + ASSERT(cvp); + ip = LINVFS_GET_IP(cvp); + if (!ip) { + VN_RELE(cvp); + return ERR_PTR(-EACCES); + } + error = -linvfs_revalidate_core(ip, 
ATTR_COMM); + } + if (error && (error != ENOENT)) + return ERR_PTR(-error); + d_add(dentry, ip); /* Negative entry goes in if ip is NULL */ + return NULL; +} + +STATIC int +linvfs_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + int error; + vnode_t *tdvp; /* Target directory for new name/link */ + vnode_t *vp; /* vp of name being linked */ + struct inode *ip; /* inode of guy being linked to */ + + ip = old_dentry->d_inode; /* inode being linked to */ + if (S_ISDIR(ip->i_mode)) + return -EPERM; + + tdvp = LINVFS_GET_VP(dir); + vp = LINVFS_GET_VP(ip); + + error = 0; + VOP_LINK(tdvp, vp, dentry, NULL, error); + if (!error) { + VMODIFY(tdvp); + VN_HOLD(vp); + validate_fields(ip); + d_instantiate(dentry, ip); + mark_inode_dirty_sync(ip); + } + return -error; +} + +STATIC int +linvfs_unlink( + struct inode *dir, + struct dentry *dentry) +{ + int error = 0; + struct inode *inode; + vnode_t *dvp; /* directory containing name to remove */ + + inode = dentry->d_inode; + + dvp = LINVFS_GET_VP(dir); + + VOP_REMOVE(dvp, dentry, NULL, error); + + if (!error) { + validate_fields(dir); /* For size only */ + validate_fields(inode); + mark_inode_dirty_sync(inode); + mark_inode_dirty_sync(dir); + } + + return -error; +} + +STATIC int +linvfs_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + int error; + vnode_t *dvp; /* directory containing name to remove */ + vnode_t *cvp; /* used to lookup symlink to put in dentry */ + vattr_t va; + struct inode *ip = NULL; + + dvp = LINVFS_GET_VP(dir); + + bzero(&va, sizeof(va)); + va.va_type = VLNK; + va.va_mode = 0777 & ~current->fs->umask; + va.va_mask = AT_TYPE|AT_MODE; /* AT_PROJID? 
*/ + + error = 0; + VOP_SYMLINK(dvp, dentry, &va, (char *)symname, + &cvp, NULL, error); + if (!error) { + ASSERT(cvp); + ASSERT(cvp->v_type == VLNK); + ip = LINVFS_GET_IP(cvp); + if (!ip) { + error = ENOMEM; + VN_RELE(cvp); + } else { + /* linvfs_revalidate_core returns (-) errors */ + error = -linvfs_revalidate_core(ip, ATTR_COMM); + d_instantiate(dentry, ip); + validate_fields(dir); + mark_inode_dirty_sync(ip); + mark_inode_dirty_sync(dir); + } + } + return -error; +} + +STATIC int +linvfs_rmdir( + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + vnode_t *dvp = LINVFS_GET_VP(dir); + int error; + + VOP_RMDIR(dvp, dentry, NULL, error); + if (!error) { + validate_fields(inode); + validate_fields(dir); + mark_inode_dirty_sync(inode); + mark_inode_dirty_sync(dir); + } + return -error; +} + +STATIC int +linvfs_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry) +{ + int error; + vnode_t *fvp; /* from directory */ + vnode_t *tvp; /* target directory */ + struct inode *new_inode = NULL; + + fvp = LINVFS_GET_VP(odir); + tvp = LINVFS_GET_VP(ndir); + + new_inode = ndentry->d_inode; + + VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error); + if (error) + return -error; + + if (new_inode) { + validate_fields(new_inode); + } + + validate_fields(odir); + if (ndir != odir) + validate_fields(ndir); + mark_inode_dirty(ndir); + return 0; +} + +STATIC int +linvfs_readlink( + struct dentry *dentry, + char *buf, + int size) +{ + vnode_t *vp; + uio_t uio; + iovec_t iov; + int error; + + vp = LINVFS_GET_VP(dentry->d_inode); + + iov.iov_base = buf; + iov.iov_len = size; + + uio.uio_iov = &iov; + uio.uio_offset = 0; + uio.uio_segflg = UIO_USERSPACE; + uio.uio_resid = size; + + VOP_READLINK(vp, &uio, NULL, error); + if (error) + return -error; + + return (size - uio.uio_resid); +} + +/* + * careful here - this function can get called recusively, so + * we need to be very careful about how much 
stack we use.
+ * uio is kmalloced for this reason...
+ *
+ * The link text is read into a kmalloced buffer of MAXPATHLEN+1 bytes:
+ * a symlink target is a path, not a single name component, so sizing
+ * the buffer with MAXNAMELEN would cut longer targets short.
+ * Returns 0 or a (-) errno.
+ */
+STATIC int
+linvfs_follow_link(
+	struct dentry	*dentry,
+	struct nameidata *nd)
+{
+	vnode_t	*vp;
+	uio_t	*uio;
+	iovec_t	iov;
+	int	error;
+	char	*link;
+
+	ASSERT(dentry);
+	ASSERT(nd);
+
+	/* one spare byte for the terminating NUL added below */
+	link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+	if (!link)
+		return -ENOMEM;
+
+	uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
+	if (!uio) {
+		kfree(link);
+		return -ENOMEM;
+	}
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+
+	iov.iov_base = link;
+	iov.iov_len = MAXPATHLEN;
+
+	uio->uio_iov = &iov;
+	uio->uio_offset = 0;
+	uio->uio_segflg = UIO_SYSSPACE;
+	uio->uio_resid = MAXPATHLEN;
+	uio->uio_fmode = 0;
+
+	VOP_READLINK(vp, uio, NULL, error);
+	if (error) {
+		kfree(uio);
+		kfree(link);
+		return -error;
+	}
+
+	/* bytes transferred == MAXPATHLEN - resid; terminate right there */
+	link[MAXPATHLEN - uio->uio_resid] = '\0';
+	kfree(uio);
+
+	/* vfs_follow_link returns (-) errors */
+	error = vfs_follow_link(nd, link);
+	kfree(link);
+	return error;
+}
+
+/*
+ * Permission check: shift the linux access mask into the position the
+ * vnode layer expects and let VOP_ACCESS decide. Returns 0 or (-) errno.
+ */
+STATIC int
+linvfs_permission(
+	struct inode	*inode,
+	int	mode)
+{
+	vnode_t	*vp = LINVFS_GET_VP(inode);
+	int	error;
+
+	mode <<= 6;	/* convert from linux to vnode access bits */
+	VOP_ACCESS(vp, mode, NULL, error);
+	return -error;
+}
+
+/* Brute force approach for now - copy data into linux inode
+ * from the results of a getattr. This gets called out of things
+ * like stat.
+ */ +int +linvfs_revalidate_core( + struct inode *inode, + int flags) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + + /* vn_revalidate returns (-) error so this is ok */ + return vn_revalidate(vp, flags); +} + +STATIC int +linvfs_revalidate( + struct dentry *dentry) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + + if (unlikely(vp->v_flag & VMODIFIED)) { + return linvfs_revalidate_core(dentry->d_inode, 0); + } + return 0; +} + +STATIC int +linvfs_setattr( + struct dentry *dentry, + struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + vattr_t vattr; + unsigned int ia_valid = attr->ia_valid; + int error; + int flags = 0; + + memset(&vattr, 0, sizeof(vattr_t)); + if (ia_valid & ATTR_UID) { + vattr.va_mask |= AT_UID; + vattr.va_uid = attr->ia_uid; + } + if (ia_valid & ATTR_GID) { + vattr.va_mask |= AT_GID; + vattr.va_gid = attr->ia_gid; + } + if (ia_valid & ATTR_SIZE) { + vattr.va_mask |= AT_SIZE; + vattr.va_size = attr->ia_size; + } + if (ia_valid & ATTR_ATIME) { + vattr.va_mask |= AT_ATIME; + vattr.va_atime.tv_sec = attr->ia_atime; + vattr.va_atime.tv_nsec = 0; + } + if (ia_valid & ATTR_MTIME) { + vattr.va_mask |= AT_MTIME; + vattr.va_mtime.tv_sec = attr->ia_mtime; + vattr.va_mtime.tv_nsec = 0; + } + if (ia_valid & ATTR_CTIME) { + vattr.va_mask |= AT_CTIME; + vattr.va_ctime.tv_sec = attr->ia_ctime; + vattr.va_ctime.tv_nsec = 0; + } + if (ia_valid & ATTR_MODE) { + vattr.va_mask |= AT_MODE; + vattr.va_mode = attr->ia_mode; + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + inode->i_mode &= ~S_ISGID; + } + + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) + flags = ATTR_UTIME; + + VOP_SETATTR(vp, &vattr, flags, NULL, error); + if (error) + return(-error); /* Positive error up from XFS */ + if (ia_valid & ATTR_SIZE) { + error = vmtruncate(inode, attr->ia_size); + } + + if (!error) { + vn_revalidate(vp, 0); + mark_inode_dirty_sync(inode); + } + return error; +} + +STATIC void +linvfs_truncate( + struct inode 
*inode) +{ + block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block); +} + + + +/* + * Extended attributes interfaces + */ + +#define SYSTEM_NAME "system." /* VFS shared names/values */ +#define ROOT_NAME "xfsroot." /* XFS ondisk names/values */ +#define USER_NAME "user." /* user's own names/values */ +STATIC xattr_namespace_t xfs_namespace_array[] = { + { .name= SYSTEM_NAME, .namelen= sizeof(SYSTEM_NAME)-1,.exists= NULL }, + { .name= ROOT_NAME, .namelen= sizeof(ROOT_NAME)-1, .exists= NULL }, + { .name= USER_NAME, .namelen= sizeof(USER_NAME)-1, .exists= NULL }, + { .name= NULL } +}; +xattr_namespace_t *xfs_namespaces = &xfs_namespace_array[0]; + +#define POSIXACL_ACCESS "posix_acl_access" +#define POSIXACL_ACCESS_SIZE (sizeof(POSIXACL_ACCESS)-1) +#define POSIXACL_DEFAULT "posix_acl_default" +#define POSIXACL_DEFAULT_SIZE (sizeof(POSIXACL_DEFAULT)-1) +#define POSIXCAP "posix_capabilities" +#define POSIXCAP_SIZE (sizeof(POSIXCAP)-1) +#define POSIXMAC "posix_mac" +#define POSIXMAC_SIZE (sizeof(POSIXMAC)-1) +STATIC xattr_namespace_t sys_namespace_array[] = { + { .name= POSIXACL_ACCESS, + .namelen= POSIXACL_ACCESS_SIZE, .exists= _ACL_ACCESS_EXISTS }, + { .name= POSIXACL_DEFAULT, + .namelen= POSIXACL_DEFAULT_SIZE, .exists= _ACL_DEFAULT_EXISTS }, + { .name= POSIXCAP, + .namelen= POSIXCAP_SIZE, .exists= _CAP_EXISTS }, + { .name= POSIXMAC, + .namelen= POSIXMAC_SIZE, .exists= _MAC_EXISTS }, + { .name= NULL } +}; + +/* + * Some checks to prevent people abusing EAs to get over quota: + * - Don't allow modifying user EAs on devices/symlinks; + * - Don't allow modifying user EAs if sticky bit set; + */ +STATIC int +capable_user_xattr( + struct inode *inode) +{ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && + !capable(CAP_SYS_ADMIN)) + return 0; + if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && + (current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return 0; + return 1; +} + +STATIC int +linvfs_setxattr( + struct dentry *dentry, + 
const char *name, + void *data, + size_t size, + int flags) +{ + int error; + int xflags = 0; + char *p = (char *)name; + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name, + xfs_namespaces[SYSTEM_NAMES].namelen) == 0) { + error = -EINVAL; + if (flags & XATTR_CREATE) + return error; + error = -ENOATTR; + p += xfs_namespaces[SYSTEM_NAMES].namelen; + if (strcmp(p, POSIXACL_ACCESS) == 0) { + if (vp->v_flag & VMODIFIED) { + error = linvfs_revalidate_core(inode, 0); + if (error) + return error; + } + error = xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS); + if (!error) { + VMODIFY(vp); + error = linvfs_revalidate_core(inode, 0); + } + } + else if (strcmp(p, POSIXACL_DEFAULT) == 0) { + error = linvfs_revalidate_core(inode, 0); + if (error) + return error; + error = xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT); + if (!error) { + VMODIFY(vp); + error = linvfs_revalidate_core(inode, 0); + } + } + else if (strcmp(p, POSIXCAP) == 0) { + error = xfs_cap_vset(vp, data, size); + } + return error; + } + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + + if (strncmp(name, xfs_namespaces[ROOT_NAMES].name, + xfs_namespaces[ROOT_NAMES].namelen) == 0) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + xflags |= ATTR_ROOT; + p += xfs_namespaces[ROOT_NAMES].namelen; + VOP_ATTR_SET(vp, p, data, size, xflags, NULL, error); + return -error; + } + if (strncmp(name, xfs_namespaces[USER_NAMES].name, + xfs_namespaces[USER_NAMES].namelen) == 0) { + if (!capable_user_xattr(inode)) + return -EPERM; + p += xfs_namespaces[USER_NAMES].namelen; + VOP_ATTR_SET(vp, p, data, size, xflags, NULL, error); + return -error; + } + return -ENOATTR; +} + +STATIC ssize_t +linvfs_getxattr( + struct dentry *dentry, + const char *name, + void *data, + size_t size) +{ + ssize_t error; + int xflags = 0; + char *p = 
(char *)name; + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name, + xfs_namespaces[SYSTEM_NAMES].namelen) == 0) { + error = -ENOATTR; + p += xfs_namespaces[SYSTEM_NAMES].namelen; + if (strcmp(p, POSIXACL_ACCESS) == 0) { + if (vp->v_flag & VMODIFIED) { + error = linvfs_revalidate_core(inode, 0); + if (error) + return error; + } + error = xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS); + } + else if (strcmp(p, POSIXACL_DEFAULT) == 0) { + if (vp->v_flag & VMODIFIED) { + error = linvfs_revalidate_core(inode, 0); + if (error) + return error; + } + error = xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT); + } + else if (strcmp(p, POSIXCAP) == 0) { + error = xfs_cap_vget(vp, data, size); + } + return error; + } + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) + xflags |= ATTR_KERNOVAL; + + if (strncmp(name, xfs_namespaces[ROOT_NAMES].name, + xfs_namespaces[ROOT_NAMES].namelen) == 0) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + xflags |= ATTR_ROOT; + p += xfs_namespaces[ROOT_NAMES].namelen; + VOP_ATTR_GET(vp, p, data, (int *)&size, xflags, NULL, error); + if (!error) + error = -size; + return -error; + } + if (strncmp(name, xfs_namespaces[USER_NAMES].name, + xfs_namespaces[USER_NAMES].namelen) == 0) { + p += xfs_namespaces[USER_NAMES].namelen; + if (!capable_user_xattr(inode)) + return -EPERM; + VOP_ATTR_GET(vp, p, data, (int *)&size, xflags, NULL, error); + if (!error) + error = -size; + return -error; + } + return -ENOATTR; +} + + +STATIC ssize_t +linvfs_listxattr( + struct dentry *dentry, + char *data, + size_t size) +{ + ssize_t error; + int result = 0; + int xflags = ATTR_KERNAMELS; + char *k = data; + attrlist_cursor_kern_t cursor; + xattr_namespace_t *sys; + vnode_t *vp; + + vp = LINVFS_GET_VP(dentry->d_inode); + + if (!size) + xflags |= ATTR_KERNOVAL; + if (capable(CAP_SYS_ADMIN)) + xflags |= ATTR_KERNFULLS; + + memset(&cursor, 0, sizeof(cursor)); + 
VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error); + if (error > 0) + return -error; + result += -error; + + k += result; /* advance start of our buffer */ + for (sys = &sys_namespace_array[0]; sys->name != NULL; sys++) { + if (sys->exists == NULL || !sys->exists(vp)) + continue; + result += xfs_namespaces[SYSTEM_NAMES].namelen; + result += sys->namelen + 1; + if (size) { + if (result > size) + return -ERANGE; + strcpy(k, xfs_namespaces[SYSTEM_NAMES].name); + k += xfs_namespaces[SYSTEM_NAMES].namelen; + strcpy(k, sys->name); + k += sys->namelen + 1; + } + } + return result; +} + +STATIC int +linvfs_removexattr( + struct dentry *dentry, + const char *name) +{ + int error; + int xflags = 0; + char *p = (char *)name; + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name, + xfs_namespaces[SYSTEM_NAMES].namelen) == 0) { + error = -ENOATTR; + p += xfs_namespaces[SYSTEM_NAMES].namelen; + if (strcmp(p, POSIXACL_ACCESS) == 0) + error = xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); + else if (strcmp(p, POSIXACL_DEFAULT) == 0) + error = xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT); + else if (strcmp(p, POSIXCAP) == 0) + error = xfs_cap_vremove(vp); + return error; + } + + if (strncmp(name, xfs_namespaces[ROOT_NAMES].name, + xfs_namespaces[ROOT_NAMES].namelen) == 0) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + xflags |= ATTR_ROOT; + p += xfs_namespaces[ROOT_NAMES].namelen; + VOP_ATTR_REMOVE(vp, p, xflags, NULL, error); + return -error; + } + if (strncmp(name, xfs_namespaces[USER_NAMES].name, + xfs_namespaces[USER_NAMES].namelen) == 0) { + p += xfs_namespaces[USER_NAMES].namelen; + if (!capable_user_xattr(inode)) + return -EPERM; + VOP_ATTR_REMOVE(vp, p, xflags, NULL, error); + return -error; + } + return -ENOATTR; +} + + +struct inode_operations linvfs_file_inode_operations = +{ + .permission = linvfs_permission, + .truncate = linvfs_truncate, + .revalidate = linvfs_revalidate, + .setattr = 
linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; + +struct inode_operations linvfs_dir_inode_operations = +{ + .create = linvfs_create, + .lookup = linvfs_lookup, + .link = linvfs_link, + .unlink = linvfs_unlink, + .symlink = linvfs_symlink, + .mkdir = linvfs_mkdir, + .rmdir = linvfs_rmdir, + .mknod = linvfs_mknod, + .rename = linvfs_rename, + .permission = linvfs_permission, + .revalidate = linvfs_revalidate, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; + +struct inode_operations linvfs_symlink_inode_operations = +{ + .readlink = linvfs_readlink, + .follow_link = linvfs_follow_link, + .permission = linvfs_permission, + .revalidate = linvfs_revalidate, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_iops.h linux-2.4-xfs/fs/xfs/linux/xfs_iops.h --- linux-2.4.19/fs/xfs/linux/xfs_iops.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_iops.h Thu Sep 5 15:35:08 2002 @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. 
Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_IOPS_H__ +#define __XFS_IOPS_H__ + +/* + * Extended system attributes. + * So far only POSIX ACLs are supported, but this will need to + * grow in time (capabilities, mandatory access control, etc). + */ +#define XFS_SYSTEM_NAMESPACE SYSTEM_POSIXACL + +/* + * Define a table of the namespaces XFS supports + */ +typedef int (*xattr_exists_t)(vnode_t *); + +typedef struct xattr_namespace { + char *name; + unsigned int namelen; + xattr_exists_t exists; +} xattr_namespace_t; + +#define SYSTEM_NAMES 0 +#define ROOT_NAMES 1 +#define USER_NAMES 2 +extern struct xattr_namespace *xfs_namespaces; + + +extern struct inode_operations linvfs_file_inode_operations; +extern struct inode_operations linvfs_dir_inode_operations; +extern struct inode_operations linvfs_symlink_inode_operations; + +extern struct file_operations linvfs_file_operations; +extern struct file_operations linvfs_dir_operations; + +extern struct address_space_operations linvfs_aops; + +extern int linvfs_revalidate_core(struct inode *, int); +extern int linvfs_get_block(struct inode *, long, struct buffer_head *, int); + +#endif /* __XFS_IOPS_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_linux.h linux-2.4-xfs/fs/xfs/linux/xfs_linux.h --- 
linux-2.4.19/fs/xfs/linux/xfs_linux.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_linux.h Thu Sep 5 15:35:08 2002 @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_LINUX__ +#define __XFS_LINUX__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef STATIC +#define STATIC static +#endif + +typedef struct xfs_dirent { /* data from readdir() */ + xfs_ino_t d_ino; /* inode number of entry */ + xfs_off_t d_off; /* offset of disk directory entry */ + unsigned short d_reclen; /* length of this record */ + char d_name[1]; /* name of file */ +} xfs_dirent_t; + +#define DIRENTBASESIZE (((xfs_dirent_t *)0)->d_name - (char *)0) +#define DIRENTSIZE(namelen) \ + ((DIRENTBASESIZE + (namelen) + \ + sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1)) + +#define NBPP PAGE_SIZE +#define DPPSHFT (PAGE_SHIFT - 9) +#define NDPP (1 << (PAGE_SHIFT - 9)) +#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) +#define dtopt(DD) ((DD) >> DPPSHFT) +#define dpoff(DD) ((DD) & (NDPP-1)) + +#define NBBY 8 /* number of bits per byte */ +#define NBPC PAGE_SIZE /* Number of bytes per click */ +#define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */ + +/* + * Size of block device i/o is parameterized here. + * Currently the system supports page-sized i/o. 
+ */ +#define BLKDEV_IOSHIFT BPCSHIFT +#define BLKDEV_IOSIZE (1<>BPCSHIFT) +#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) +#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) +#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) +#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) +#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) + +/* off_t bytes to clicks */ +#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) +#define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT) + +/* clicks to off_t bytes */ +#define ctooff(x) ((xfs_off_t)(x)<>BPCSHIFT) +#define ctob64(x) ((__uint64_t)(x)<>BPCSHIFT) + +#ifndef CELL_CAPABLE +#define FSC_NOTIFY_NAME_CHANGED(vp) +#endif + +#ifndef ENOATTR +#define ENOATTR ENODATA /* Attribute not found */ +#endif + +/* Note: EWRONGFS never visible outside the kernel */ +#define EWRONGFS EINVAL /* Mount with wrong filesystem type */ + +/* + * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't + * return codes out of its known range in errno. + * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't + * conflict with any code we use already or any code a driver may use) + * XXX Some options (currently we do #2): + * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated] + * 2/ 990 ["Unknown error 990"] + * 3/ EUCLEAN ["Structure needs cleaning"] + * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace] + */ +#define EFSCORRUPTED 990 /* Filesystem is corrupted */ + +#define SYNCHRONIZE() barrier() +#define lbolt jiffies +#define rootdev ROOT_DEV +#define __return_address __builtin_return_address(0) +#define LONGLONG_MAX 9223372036854775807LL /* max "long long int" */ +#define nopkg() ( ENOSYS ) + +/* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */ +/* we may well need to fine-tune this if it ever becomes an issue. 
*/
+#define DQUOT_MAX_HEURISTIC	1024	/* NR_DQUOTS */
+#define ndquot	DQUOT_MAX_HEURISTIC
+
+/* IRIX uses the current size of the name cache to guess a good value */
+/* - this isn't the same but is a good enough starting point for now. */
+#define DQUOT_HASH_HEURISTIC	files_stat.nr_files
+
+/* IRIX inodes maintain the project ID also, zero this field on Linux */
+#define DEFAULT_PROJID	0
+#define dfltprid	DEFAULT_PROJID
+
+#define MAXNAMELEN	256
+#define MAXPATHLEN	1024
+
+#define FINVIS	0x0100	/* don't update timestamps - XFS */
+
+#define MIN(a,b)	(min(a,b))
+#define MAX(a,b)	(max(a,b))
+#define howmany(x, y)	(((x)+((y)-1))/(y))
+#define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
+
+/* Move the kernel do_div definition off to one side */
+
+#if defined __i386__
+/* For ia32 we need to pull some tricks to get past various versions
+ * of the compiler which do not like us using do_div in the middle
+ * of large functions.
+ */
+/*
+ * Divide the unsigned value stored at "a" by "b" in place and return
+ * the remainder. "n" is the size of that value in bytes: 4 and 8 are
+ * handled, any other size falls out of the switch and returns 0.
+ * The do_div() macro below supplies n as sizeof() of its operand.
+ */
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			/* split into 32-bit halves; divide the high half first */
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
+			/* divl: (__upper:__low) / b -> quotient __low, remainder __mod */
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			/* reassemble the 64-bit quotient from its two halves */
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			*(__u64 *)a = c;
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+/*
+ * Same arithmetic as xfs_do_div, but the value at "a" is never written
+ * back: only the remainder is returned.
+ */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			/* only the local copy c is updated here; *a is left alone */
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#else
+/*
+ * Non-ia32 variant: divide the value at "a" (n bytes, 4 or 8) by "b"
+ * in place and return the remainder. The 8-byte case uses the do_div
+ * that is in effect at this point, i.e. before the #undef below.
+ */
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			mod = do_div(*(__u64 *)a, b);
+			return mod;
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			/* divide a local copy so the caller's value is untouched */
+			__u64	c = *(__u64 *)a;
+			return do_div(c, b);
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#endif
+
+/*
+ * From here on do_div(a, b) and do_mod(a, b) go through the helpers
+ * above, with the operand size taken from sizeof(a).
+ */
+#undef do_div
+#define do_div(a, b)	xfs_do_div(&(a), (b), sizeof(a))
+#define do_mod(a, b)	xfs_do_mod(&(a), (b), sizeof(a))
+
+/*
+ * Round x up to a multiple of y: add y - 1, divide by y (do_div leaves
+ * the quotient in x), then multiply back.
+ */
+static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
+{
+	x += y - 1;
+	do_div(x, y);
+	return(x * y);
+}
+
+#endif /* __XFS_LINUX__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_lrw.c linux-2.4-xfs/fs/xfs/linux/xfs_lrw.c
--- linux-2.4.19/fs/xfs/linux/xfs_lrw.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_lrw.c Thu Sep 5 15:35:08 2002
@@ -0,0 +1,1812 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.
Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +/* + * fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff) + * + */ + +#include +#include +#include + + +#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ + << mp->m_writeio_log) +#define XFS_STRAT_WRITE_IMAPS 2 + +STATIC int xfs_iomap_read(xfs_iocore_t *, loff_t, size_t, int, pb_bmap_t *, + int *, struct pm *); +STATIC int xfs_iomap_write(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *, + int *, int, struct pm *); +STATIC int xfs_iomap_write_delay(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *, + int *, int, int); +STATIC int xfs_iomap_write_direct(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *, + int *, int, int); +STATIC int _xfs_imap_to_bmap(xfs_iocore_t *, xfs_off_t, xfs_bmbt_irec_t *, + pb_bmap_t *, int, int); + + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. 
+ */ +STATIC int +xfs_iozero( + struct inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count, /* size of data to zero */ + loff_t end_size) /* max file size to set */ +{ + unsigned bytes; + struct page *page; + struct address_space *mapping; + char *kaddr; + int status; + + mapping = ip->i_mapping; + do { + unsigned long index, offset; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + break; + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(NULL, page, offset, + offset + bytes); + if (status) { + goto unlock; + } + + memset((void *) (kaddr + offset), 0, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(NULL, page, offset, + offset + bytes); + if (!status) { + pos += bytes; + count -= bytes; + if (pos > ip->i_size) + ip->i_size = pos < end_size ? 
pos : end_size; + } + +unlock: + kunmap(page); + unlock_page(page); + page_cache_release(page); + if (status) + break; + } while (count); + + return (-status); +} + +ssize_t /* bytes read, or (-) error */ +xfs_read( + bhv_desc_t *bdp, + struct file *file, + char *buf, + size_t size, + loff_t *offset, + cred_t *credp) +{ + ssize_t ret; + xfs_fsize_t n; + xfs_inode_t *ip; + xfs_mount_t *mp; + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + XFS_STATS_INC(xfsstats.xs_read_calls); + + if (file->f_flags & O_DIRECT) { + if (((__psint_t)buf & BBMASK) || + (*offset & mp->m_blockmask) || + (size & mp->m_blockmask)) { + if (*offset == ip->i_d.di_size) { + return (0); + } + return -XFS_ERROR(EINVAL); + } + } + + + n = XFS_MAX_FILE_OFFSET - *offset; + if ((n <= 0) || (size == 0)) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) { + return -EIO; + } + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && + !(file->f_mode & FINVIS)) { + int error; + vrwlock_t locktype = VRWLOCK_READ; + + error = xfs_dm_send_data_event(DM_EVENT_READ, bdp, + *offset, size, + FILP_DELAY_FLAG(file), + &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + + ret = generic_file_read(file, buf, size, offset); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + XFS_STATS_ADD(xfsstats.xs_read_bytes, ret); + + if (!(file->f_mode & FINVIS)) + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. 
+ */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + struct inode *ip, + xfs_iocore_t *io, + xfs_off_t offset, + xfs_fsize_t isize, + xfs_fsize_t end_size, + struct pm *pmp) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp; + int nimaps; + int zero_offset; + int zero_len; + int isize_fsb_offset; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); + ASSERT(offset > isize); + + mp = io->io_mount; + + isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize); + if (isize_fsb_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Get a pagebuf for the last block, zero the part beyond the + * EOF, and write it out sync. We need to drop the ilock + * while we do this so we don't deadlock when the buffer cache + * calls back to us. + */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); + loff = XFS_FSB_TO_B(mp, last_fsb); + lsize = XFS_FSB_TO_B(mp, 1); + + zero_offset = isize_fsb_offset; + zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset; + + error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. 
This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. + */ + +int /* error (positive) */ +xfs_zero_eof( + vnode_t *vp, + xfs_iocore_t *io, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize, /* current inode size */ + xfs_fsize_t end_size, /* terminal inode size */ + struct pm *pmp) +{ + struct inode *ip = LINVFS_GET_IP(vp); + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t prev_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_extlen_t buf_len_fsb; + xfs_extlen_t prev_zero_count; + xfs_mount_t *mp; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + + mp = io->io_mount; + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, io, offset, isize, end_size, pmp); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? 
XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + /* + * Maybe change this loop to do the bmapi call and + * loop while we split the mappings into pagebufs? + */ + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + start_zero_fsb = imap.br_startoff + + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks in the range requested. + * Zero them a single write at a time. We actually + * don't zero the entire range returned if it is + * too big and simply loop around to get the rest. + * That is not the most efficient thing to do, but it + * is simple and this path should not be exercised often. + */ + buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount, + mp->m_writeio_blocks << 8); + /* + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. 
+ */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + + loff = XFS_FSB_TO_B(mp, start_zero_fsb); + lsize = XFS_FSB_TO_B(mp, buf_len_fsb); + + error = xfs_iozero(ip, loff, lsize, end_size); + + if (error) { + goto out_lock; + } + + prev_zero_fsb = start_zero_fsb; + prev_zero_count = buf_len_fsb; + start_zero_fsb = imap.br_startoff + buf_len_fsb; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + } + + return 0; + +out_lock: + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +ssize_t /* bytes written, or (-) error */ +xfs_write( + bhv_desc_t *bdp, + struct file *file, + const char *buf, + size_t size, + loff_t *offset, + cred_t *credp) +{ + xfs_inode_t *xip; + xfs_mount_t *mp; + ssize_t ret; + int error = 0; + xfs_fsize_t isize, new_size; + xfs_fsize_t n, limit = XFS_MAX_FILE_OFFSET; + xfs_iocore_t *io; + vnode_t *vp; + int iolock; + int direct = file->f_flags & O_DIRECT; + int eventsent = 0; + vrwlock_t locktype; + + XFS_STATS_INC(xfsstats.xs_write_calls); + + vp = BHV_TO_VNODE(bdp); + xip = XFS_BHVTOI(bdp); + + if (size == 0) + return 0; + + io = &(xip->i_iocore); + mp = io->io_mount; + + xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE); + + if (XFS_FORCED_SHUTDOWN(xip->i_mount)) { + return -EIO; + } + + if (direct) { + if (((__psint_t)buf & BBMASK) || + (*offset & mp->m_blockmask) || + (size & mp->m_blockmask)) { + return XFS_ERROR(-EINVAL); + } + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + } else { + iolock = XFS_IOLOCK_EXCL; + locktype = VRWLOCK_WRITE; + } + + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + isize = xip->i_d.di_size; + + if (file->f_flags & O_APPEND) + *offset = isize; + +start: + n = limit - *offset; + if (n <= 0) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + return -EFBIG; + } + if (n < size) + size = n; + + new_size = *offset + size; + if (new_size > isize) { + io->io_new_size = new_size; + } + + if ((DM_EVENT_ENABLED(vp->v_vfsp, 
xip, DM_EVENT_WRITE) && + !(file->f_mode & FINVIS) && !eventsent)) { + loff_t savedsize = *offset; + + xfs_iunlock(xip, XFS_ILOCK_EXCL); + error = xfs_dm_send_data_event(DM_EVENT_WRITE, bdp, + *offset, size, + FILP_DELAY_FLAG(file), &locktype); + if (error) { + xfs_iunlock(xip, iolock); + return -error; + } + xfs_ilock(xip, XFS_ILOCK_EXCL); + eventsent = 1; + + /* + * The iolock was dropped and reaquired in + * xfs_dm_send_data_event so we have to recheck the size + * when appending. We will only "goto start;" once, + * since having sent the event prevents another call + * to xfs_dm_send_data_event, which is what + * allows the size to change in the first place. + */ + if ((file->f_flags & O_APPEND) && + savedsize != xip->i_d.di_size) { + *offset = isize = xip->i_d.di_size; + goto start; + } + } + + /* + * On Linux, generic_file_write updates the times even if + * no data is copied in so long as the write had a size. + * + * We must update xfs' times since revalidate will overcopy xfs. + */ + if (size) { + if (!(file->f_mode & FINVIS)) + xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + + /* + * If the offset is beyond the size of the file, we have a couple + * of things to do. First, if there is already space allocated + * we need to either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need + * to zero it out up to the new size. + */ + + if (!direct && (*offset > isize && isize)) { + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, + isize, *offset + size, NULL); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + return(-error); + } + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the + * setuid and setgid bits if the process is not being run + * by root. This keeps people from modifying setuid and + * setgid binaries. 
+ */ + + if (((xip->i_d.di_mode & ISUID) || + ((xip->i_d.di_mode & (ISGID | (IEXEC >> 3))) == + (ISGID | (IEXEC >> 3)))) && + !capable(CAP_FSETID)) { + error = xfs_write_clear_setuid(xip); + if (error) { + xfs_iunlock(xip, iolock); + return -error; + } + } + +retry: + if (direct) { + xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1); + } + + ret = do_generic_file_write(file, buf, size, offset); + + if ((ret == -ENOSPC) && + DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && + !(file->f_mode & FINVIS)) { + + xfs_rwunlock(bdp, locktype); + error = dm_send_namesp_event(DM_EVENT_NOSPACE, bdp, + DM_RIGHT_NULL, bdp, DM_RIGHT_NULL, NULL, NULL, + 0, 0, 0); /* Delay flag intentionally unused */ + if (error) + return -error; + xfs_rwlock(bdp, locktype); + *offset = xip->i_d.di_size; + goto retry; + + } + + if (ret <= 0) { + xfs_rwunlock(bdp, locktype); + return ret; + } + + XFS_STATS_ADD(xfsstats.xs_write_bytes, ret); + + if (*offset > xip->i_d.di_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + if (*offset > xip->i_d.di_size) { + struct inode *inode = LINVFS_GET_IP(vp); + + inode->i_size = xip->i_d.di_size = *offset; + xip->i_update_core = 1; + xip->i_update_size = 1; + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) { + + /* + * If we're treating this as O_DSYNC and we have not updated the + * size, force the log. + */ + + if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) + && !(xip->i_update_size)) { + /* + * If an allocation transaction occurred + * without extending the size, then we have to force + * the log up the proper point to ensure that the + * allocation is permanent. We can't count on + * the fact that buffered writes lock out direct I/O + * writes - the direct I/O write could have extended + * the size nontransactionally, then finished before + * we started. 
xfs_write_file will think that the file + * didn't grow but the update isn't safe unless the + * size change is logged. + * + * Force the log if we've committed a transaction + * against the inode or if someone else has and + * the commit record hasn't gone to disk (e.g. + * the inode is pinned). This guarantees that + * all changes affecting the inode are permanent + * when we return. + */ + + xfs_inode_log_item_t *iip; + xfs_lsn_t lsn; + + iip = xip->i_itemp; + if (iip && iip->ili_last_lsn) { + lsn = iip->ili_last_lsn; + xfs_log_force(mp, lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } else if (xfs_ipincount(xip) > 0) { + xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } + + } else { + xfs_trans_t *tp; + + /* + * O_SYNC or O_DSYNC _with_ a size update are handled + * the same way. + * + * If the write was synchronous then we need to make + * sure that the inode modification time is permanent. + * We'll have updated the timestamp above, so here + * we use a synchronous transaction to log the inode. + * It's not fast, but it's necessary. + * + * If this a dsync write and the size got changed + * non-transactionally, then we need to ensure that + * the size change gets logged in a synchronous + * transaction. + */ + + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); + if ((error = xfs_trans_reserve(tp, 0, + XFS_SWRITE_LOG_RES(mp), + 0, 0, 0))) { + /* Transaction reserve failed */ + xfs_trans_cancel(tp, 0); + } else { + /* Transaction reserve successful */ + xfs_ilock(xip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, xip); + xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0); + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + } + } /* (ioflags & O_SYNC) */ + + /* + * If we are coming from an nfsd thread then insert into the + * reference cache. 
+ */ + + if (!strcmp(current->comm, "nfsd")) + xfs_refcache_insert(xip); + + /* Drop lock this way - the old refcache release is in here */ + xfs_rwunlock(bdp, locktype); + + return(ret); +} + +/* + * xfs_bmap() is the same as the irix xfs_bmap from xfs_rw.c + * execpt for slight changes to the params + */ +int +xfs_bmap(bhv_desc_t *bdp, + xfs_off_t offset, + ssize_t count, + int flags, + struct cred *cred, + pb_bmap_t *pbmapp, + int *npbmaps) +{ + xfs_inode_t *ip; + int error; + int lockmode; + int fsynced = 0; + vnode_t *vp; + + ip = XFS_BHVTOI(bdp); + ASSERT((ip->i_d.di_mode & IFMT) == IFREG); + ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == + ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); + ASSERT((flags & PBF_READ) || (flags & PBF_WRITE)); + + if (XFS_FORCED_SHUTDOWN(ip->i_iocore.io_mount)) + return XFS_ERROR(EIO); + + if (flags & PBF_READ) { + lockmode = xfs_ilock_map_shared(ip); + error = xfs_iomap_read(&ip->i_iocore, offset, count, + XFS_BMAPI_ENTIRE, pbmapp, npbmaps, NULL); + xfs_iunlock_map_shared(ip, lockmode); + } else { /* PBF_WRITE */ + ASSERT(flags & PBF_WRITE); + vp = BHV_TO_VNODE(bdp); + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. 
+ */ + + if (XFS_IS_QUOTA_ON(ip->i_mount)) { + if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { + if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_ILOCKED))) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return XFS_ERROR(error); + } + } + } +retry: + error = xfs_iomap_write(&ip->i_iocore, offset, count, + pbmapp, npbmaps, flags, NULL); + /* xfs_iomap_write unlocks/locks/unlocks */ + + if (error == ENOSPC) { + switch (fsynced) { + case 0: + if (ip->i_delayed_blks) { + fsync_inode_data_buffers(LINVFS_GET_IP(vp)); + fsynced = 1; + } else { + fsynced = 2; + flags |= PBF_SYNC; + } + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + goto retry; + case 1: + fsynced = 2; + if (!(flags & PBF_SYNC)) { + flags |= PBF_SYNC; + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + goto retry; + } + case 2: + fsync_no_super(LINVFS_GET_IP(vp)->i_dev); + xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + XFS_LOG_FORCE|XFS_LOG_SYNC); + + error = 0; +/** + delay(HZ); +**/ + fsynced++; + xfs_ilock(ip, XFS_ILOCK_EXCL); + goto retry; + } + } + } + + return XFS_ERROR(error); +} + +int +xfs_strategy(bhv_desc_t *bdp, + xfs_off_t offset, + ssize_t count, + int flags, + struct cred *cred, + pb_bmap_t *pbmapp, + int *npbmaps) +{ + xfs_inode_t *ip; + xfs_iocore_t *io; + xfs_mount_t *mp; + int error; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_fileoff_t map_start_fsb; + xfs_fileoff_t last_block; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + xfs_filblks_t count_fsb; + int committed, i, loops, nimaps; + int is_xfs = 1; /* This will be a variable at some point */ + xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS]; + xfs_trans_t *tp; + + ip = XFS_BHVTOI(bdp); + io = &ip->i_iocore; + mp = ip->i_mount; + /* is_xfs = IO_IS_XFS(io); */ + ASSERT((ip->i_d.di_mode & IFMT) == IFREG); + ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == + ((io->io_flags & XFS_IOCORE_RT) != 0)); + ASSERT((flags & PBF_READ) || (flags & PBF_WRITE)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + ASSERT(flags & PBF_WRITE); + 
+ offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimaps = min(XFS_MAX_RW_NBMAPS, *npbmaps); + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + first_block = NULLFSBLOCK; + + XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD); + error = XFS_BMAPI(mp, NULL, io, offset_fsb, + (xfs_filblks_t)(end_fsb - offset_fsb), + XFS_BMAPI_ENTIRE, &first_block, 0, imap, + &nimaps, NULL); + XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD); + if (error) { + return XFS_ERROR(error); + } + + if (nimaps && !ISNULLSTARTBLOCK(imap[0].br_startblock)) { + *npbmaps = _xfs_imap_to_bmap(&ip->i_iocore, offset, imap, + pbmapp, nimaps, *npbmaps); + return 0; + } + + /* + * Make sure that the dquots are there. + */ + + if (XFS_IS_QUOTA_ON(mp)) { + if (XFS_NOT_DQATTACHED(mp, ip)) { + if ((error = xfs_qm_dqattach(ip, 0))) { + return XFS_ERROR(error); + } + } + } + XFS_STATS_ADD(xfsstats.xs_xstrat_bytes, + XFS_FSB_TO_B(mp, imap[0].br_blockcount)); + + offset_fsb = imap[0].br_startoff; + count_fsb = imap[0].br_blockcount; + map_start_fsb = offset_fsb; + while (count_fsb != 0) { + /* + * Set up a transaction with which to allocate the + * backing store for the file. Do allocations in a + * loop until we get some space in the range we are + * interested in. The other space that might be allocated + * is in the delayed allocation extent on which we sit + * but before our buffer starts. + */ + nimaps = 0; + loops = 0; + while (nimaps == 0) { + if (is_xfs) { + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + error = xfs_trans_reserve(tp, 0, + XFS_WRITE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto error0; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, + XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + } else { + tp = NULL; + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL | + XFS_EXTSIZE_WR); + } + + + /* + * Allocate the backing store for the file. 
+ */ + XFS_BMAP_INIT(&(free_list), + &(first_block)); + nimaps = XFS_STRAT_WRITE_IMAPS; + + /* + * Ensure we don't go beyond eof - it is possible + * the extents changed since we did the read call, + * we dropped the ilock in the interim. + */ + + end_fsb = XFS_B_TO_FSB(mp, XFS_SIZE(mp, io)); + xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); + if ((map_start_fsb + count_fsb) > last_block) { + count_fsb = last_block - map_start_fsb; + if (count_fsb == 0) { + if (is_xfs) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | + XFS_EXTSIZE_WR); + return XFS_ERROR(EAGAIN); + } + } + + error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb, + XFS_BMAPI_WRITE, &first_block, 1, + imap, &nimaps, &free_list); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | + XFS_EXTSIZE_WR); + + goto error0; + } + + if (is_xfs) { + error = xfs_bmap_finish(&(tp), &(free_list), + first_block, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto error0; + } + + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES, + NULL); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto error0; + } + } + + if (nimaps == 0) { + XFS_IUNLOCK(mp, io, + XFS_ILOCK_EXCL|XFS_EXTSIZE_WR); + } /* else hold 'till we maybe loop again below */ + } + + /* + * See if we were able to allocate an extent that + * covers at least part of the user's requested size. 
+ */ + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + for(i = 0; i < nimaps; i++) { + int maps; + if (offset_fsb >= imap[i].br_startoff && + (offset_fsb < (imap[i].br_startoff + imap[i].br_blockcount))) { + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR); + maps = min(nimaps, *npbmaps); + *npbmaps = _xfs_imap_to_bmap(io, offset, &imap[i], + pbmapp, maps, *npbmaps); + XFS_STATS_INC(xfsstats.xs_xstrat_quick); + return 0; + } + count_fsb -= imap[i].br_blockcount; /* for next bmapi, + if needed. */ + } + + /* + * We didn't get an extent the caller can write into so + * loop around and try starting after the last imap we got back. + */ + + nimaps--; /* Index of last entry */ + ASSERT(nimaps >= 0); + ASSERT(offset_fsb >= imap[nimaps].br_startoff + imap[nimaps].br_blockcount); + ASSERT(count_fsb); + offset_fsb = imap[nimaps].br_startoff + imap[nimaps].br_blockcount; + map_start_fsb = offset_fsb; + XFS_STATS_INC(xfsstats.xs_xstrat_split); + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_WR); + } + + ASSERT(0); /* Should never get here */ + + error0: + if (error) { + ASSERT(count_fsb != 0); + ASSERT(is_xfs || XFS_FORCED_SHUTDOWN(mp)); + } + + return XFS_ERROR(error); +} + + +STATIC int +_xfs_imap_to_bmap( + xfs_iocore_t *io, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + pb_bmap_t *pbmapp, + int imaps, /* Number of imap entries */ + int pbmaps) /* Number of pbmap entries */ +{ + xfs_mount_t *mp; + xfs_fsize_t nisize; + int im, pbm; + xfs_fsblock_t start_block; + + mp = io->io_mount; + nisize = XFS_SIZE(mp, io); + if (io->io_new_size > nisize) + nisize = io->io_new_size; + + for (im=0, pbm=0; im < imaps && pbm < pbmaps; im++,pbmapp++,imap++,pbm++) { + pbmapp->pbm_target = io->io_flags & XFS_IOCORE_RT ? 
+ mp->m_rtdev_targp : + mp->m_ddev_targp; + pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imap->br_startoff); + pbmapp->pbm_delta = offset - pbmapp->pbm_offset; + pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); + pbmapp->pbm_flags = 0; + + start_block = imap->br_startblock; + if (start_block == HOLESTARTBLOCK) { + pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL; + pbmapp->pbm_flags = PBMF_HOLE; + } else if (start_block == DELAYSTARTBLOCK) { + pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL; + pbmapp->pbm_flags = PBMF_DELAY; + } else { + pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block); + if (imap->br_state == XFS_EXT_UNWRITTEN) + pbmapp->pbm_flags |= PBMF_UNWRITTEN; + } + + if ((pbmapp->pbm_offset + pbmapp->pbm_bsize) >= nisize) { + pbmapp->pbm_flags |= PBMF_EOF; + } + + offset += pbmapp->pbm_bsize - pbmapp->pbm_delta; + } + return(pbm); /* Return the number filled */ +} + +STATIC int +xfs_iomap_read( + xfs_iocore_t *io, + loff_t offset, + size_t count, + int flags, + pb_bmap_t *pbmapp, + int *npbmaps, + struct pm *pmp) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int nimaps; + int error; + xfs_mount_t *mp; + xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS]; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE | MR_ACCESS) != 0); +/** ASSERT(ismrlocked(io->io_iolock, MR_UPDATE | MR_ACCESS) != 0); **/ +/* xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count); */ + + mp = io->io_mount; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimaps = sizeof(imap) / sizeof(imap[0]); + nimaps = min(nimaps, *npbmaps); /* Don't ask for more than caller has */ + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + error = XFS_BMAPI(mp, NULL, io, offset_fsb, + (xfs_filblks_t)(end_fsb - offset_fsb), + flags, NULL, 0, imap, + &nimaps, NULL); + if (error) { + return XFS_ERROR(error); + } + + if(nimaps) { + *npbmaps = _xfs_imap_to_bmap(io, offset, imap, pbmapp, nimaps, + *npbmaps); + } else + *npbmaps = 0; + return XFS_ERROR(error); +} + +/* + * xfs_iomap_write: return pagebuf_bmap_t's 
telling higher layers + * where to write. + * There are 2 main cases: + * 1 the extents already exist + * 2 must allocate. + * There are 3 cases when we allocate: + * delay allocation (doesn't really allocate or use transactions) + * direct allocation (no previous delay allocation + * convert delay to real allocations + */ + +STATIC int +xfs_iomap_write( + xfs_iocore_t *io, + loff_t offset, + size_t count, + pb_bmap_t *pbmapp, + int *npbmaps, + int ioflag, + struct pm *pmp) +{ + int maps; + int error = 0; + int found; + int flags = 0; + + maps = *npbmaps; + if (!maps) + goto out; + + /* + * If we have extents that are allocated for this range, + * return them. + */ + + found = 0; + error = xfs_iomap_read(io, offset, count, flags, pbmapp, npbmaps, NULL); + if (error) + goto out; + + /* + * If we found mappings and they can just have data written + * without conversion, + * let the caller write these and call us again. + * + * If we have a HOLE or UNWRITTEN, proceed down lower to + * get the space or to convert to written. + */ + + if (*npbmaps) { + if (!(pbmapp->pbm_flags & PBMF_HOLE)) { + *npbmaps = 1; /* Only checked the first one. */ + /* We could check more, ... */ + goto out; + } + } + found = *npbmaps; + *npbmaps = maps; /* Restore to original requested */ + + if (ioflag & PBF_DIRECT) { + error = xfs_iomap_write_direct(io, offset, count, pbmapp, + npbmaps, ioflag, found); + } else { + error = xfs_iomap_write_delay(io, offset, count, pbmapp, + npbmaps, ioflag, found); + } + +out: + XFS_IUNLOCK(io->io_mount, io, XFS_ILOCK_EXCL); + return XFS_ERROR(error); +} + +/* + * Map the given I/O size and I/O alignment over the given extent. + * If we're at the end of the file and the underlying extent is + * delayed alloc, make sure we extend out to the + * next i_writeio_blocks boundary. Otherwise make sure that we + * are confined to the given extent. 
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_write_bmap(
+	xfs_mount_t	*mp,
+	xfs_iocore_t	*io,
+	xfs_bmbt_irec_t	*imapp,
+	pb_bmap_t	*pbmapp,
+	int		iosize,
+	xfs_fileoff_t	ioalign,
+	xfs_fsize_t	isize)		/* not referenced in this function */
+{
+	__int64_t	extra_blocks;
+	xfs_fileoff_t	size_diff;
+	xfs_fileoff_t	ext_offset;
+	xfs_fsblock_t	start_block;
+	int		length;		/* length of this mapping in blocks */
+	xfs_off_t	offset;		/* logical block offset of this mapping */
+
+	if (ioalign < imapp->br_startoff) {
+		/*
+		 * The desired alignment doesn't end up on this
+		 * extent. Move up to the beginning of the extent.
+		 * Subtract whatever we drop from the iosize so that
+		 * we stay aligned on iosize boundaries.
+		 */
+		size_diff = imapp->br_startoff - ioalign;
+		iosize -= (int)size_diff;
+		ASSERT(iosize > 0);
+		ext_offset = 0;
+		offset = imapp->br_startoff;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imapp->br_startoff);
+	} else {
+		/*
+		 * The alignment requested fits on this extent,
+		 * so use it.
+		 */
+		ext_offset = ioalign - imapp->br_startoff;
+		offset = ioalign;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, ioalign);
+	}
+	start_block = imapp->br_startblock;
+	ASSERT(start_block != HOLESTARTBLOCK);
+	if (start_block != DELAYSTARTBLOCK) {
+		pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block + ext_offset);
+		/*
+		 * NOTE(review): for a real extent that is not unwritten,
+		 * pbm_flags is not assigned here and keeps whatever the
+		 * caller left in it -- confirm callers clear it first.
+		 */
+		if (imapp->br_state == XFS_EXT_UNWRITTEN) {
+			pbmapp->pbm_flags = PBMF_UNWRITTEN;
+		}
+	} else {
+		pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
+		pbmapp->pbm_flags = PBMF_DELAY;
+	}
+	pbmapp->pbm_target = io->io_flags & XFS_IOCORE_RT ?
+						mp->m_rtdev_targp :
+						mp->m_ddev_targp;
+	length = iosize;
+
+	/*
+	 * If the iosize from our offset extends beyond the end of
+	 * the extent, then trim down length to match that of the extent.
+	 */
+	extra_blocks = (xfs_off_t)(offset + length) -
+		       (__uint64_t)(imapp->br_startoff +
+				    imapp->br_blockcount);
+	if (extra_blocks > 0) {
+		length -= extra_blocks;
+		ASSERT(length > 0);
+	}
+
+	pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, length);
+}
+
+/*
+ * Set up a delayed-allocation mapping for a buffered write of 'count'
+ * bytes at byte 'offset' and describe the first extent returned by
+ * XFS_BMAPI in pbmapp[0].
+ *
+ * io_lock must be held in update mode (asserted below).  'found' is
+ * not referenced.
+ *
+ * Returns 0 with *npbmaps set to 1 on success, ENOSPC when the
+ * mapping call hands back no extents, or the error reported by
+ * XFS_BMAPI / XFS_BMAP_EOF.  *npbmaps is not written on failure.
+ */
+STATIC int
+xfs_iomap_write_delay(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	int		found)
+{
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	ioalign;
+	xfs_fileoff_t	last_fsb;
+	xfs_fileoff_t	start_fsb;
+	xfs_filblks_t	count_fsb;
+	xfs_off_t	aligned_offset;
+	xfs_fsize_t	isize;
+	xfs_fsblock_t	firstblock;
+	__uint64_t	last_page_offset;
+	int		nimaps;
+	int		error;
+	int		n;
+	unsigned int	iosize;
+	xfs_mount_t	*mp;
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
+	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
+	int		aeof;
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
+
+/*	xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count); */
+
+	mp = io->io_mount;
+/***
+	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
+***/
+
+	/* work with the larger of the current and the pending file size */
+	isize = XFS_SIZE(mp, io);
+	if (io->io_new_size > isize) {
+		isize = io->io_new_size;
+	}
+
+	aeof = 0;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	/*
+	 * If the caller is doing a write at the end of the file,
+	 * then extend the allocation (and the buffer used for the write)
+	 * out to the file system's write iosize.  We clean up any extra
+	 * space left over when the file is closed in xfs_inactive().
+	 * We can only do this if we are sure that we will create buffers
+	 * over all of the space we allocate beyond the end of the file.
+	 * Not doing so would allow us to create delalloc blocks with
+	 * no pages in memory covering them.  So, we need to check that
+	 * there are not any real blocks in the area beyond the end of
+	 * the file which we are optimistically going to preallocate. If
+	 * there are then our buffers will stop when they encounter them
+	 * and we may accidentally create delalloc blocks beyond them
+	 * that we never cover with a buffer.  All of this is because
+	 * we are not actually going to write the extra blocks preallocated
+	 * at this point.
+	 *
+	 * We don't bother with this for sync writes, because we need
+	 * to minimize the amount we write for good performance.
+	 */
+	if (!(ioflag & PBF_SYNC) && ((offset + count) > XFS_SIZE(mp, io))) {
+		start_fsb = XFS_B_TO_FSBT(mp,
+				  ((xfs_ufsize_t)(offset + count - 1)));
+		count_fsb = mp->m_writeio_blocks;
+		while (count_fsb > 0) {
+			nimaps = XFS_WRITE_IMAPS;
+			error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
+					  0, NULL, 0, imap, &nimaps,
+					  NULL);
+			if (error) {
+				return error;
+			}
+			for (n = 0; n < nimaps; n++) {
+				/* a real extent in the way: no preallocation */
+				if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
+				    (imap[n].br_startblock != DELAYSTARTBLOCK)) {
+					goto write_map;
+				}
+				start_fsb += imap[n].br_blockcount;
+				count_fsb -= imap[n].br_blockcount;
+				ASSERT(count_fsb < 0xffff000);
+			}
+		}
+		iosize = mp->m_writeio_blocks;
+		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
+		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+		last_fsb = ioalign + iosize;
+		aeof = 1;
+	}
+ write_map:
+	nimaps = XFS_WRITE_IMAPS;
+	firstblock = NULLFSBLOCK;
+
+	/*
+	 * roundup the allocation request to m_dalign boundary if file size
+	 * is greater than 512K and we are allocating past the allocation eof
+	 *
+	 * NOTE(review): the test below compares the file size against
+	 * m_dalign itself, not against 512K as xfs_iomap_write_direct()
+	 * does -- confirm which threshold is intended.
+	 */
+	if (mp->m_dalign && (XFS_SIZE(mp, io) >= mp->m_dalign) && aeof) {
+		int eof;
+		xfs_fileoff_t new_last_fsb;
+		new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+		error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
+		if (error) {
+			return error;
+		}
+		if (eof) {
+			last_fsb = new_last_fsb;
+		}
+	}
+
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+			  (xfs_filblks_t)(last_fsb - offset_fsb),
+			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
+			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
+			  &nimaps, NULL);
+	/*
+	 * This can be EDQUOT, if nimaps == 0
+	 */
+	if (error) {
+		return XFS_ERROR(error);
+	}
+	/*
+	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
+	 * then we must have run out of space.
+	 */
+	if (nimaps == 0) {
+/*		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
+				      io, offset, count); */
+		return XFS_ERROR(ENOSPC);
+	}
+
+	if (!(ioflag & PBF_SYNC) ||
+	    ((last_fsb - offset_fsb) >= mp->m_writeio_blocks)) {
+		/*
+		 * For normal or large sync writes, align everything
+		 * into i_writeio_blocks sized chunks.
+		 */
+		iosize = mp->m_writeio_blocks;
+		aligned_offset = XFS_WRITEIO_ALIGN(mp, offset);
+		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+		/* XXX - Are we shrinking? XXXXX */
+	} else {
+		/*
+		 * For small sync writes try to minimize the amount
+		 * of I/O we do. Round down and up to the larger of
+		 * page or block boundaries.
+		 */
+		if (NBPP > mp->m_sb.sb_blocksize) {
+			aligned_offset = ctooff(offtoct(offset));
+			ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+			last_page_offset = ctob64(btoc64(offset + count));
+			iosize = XFS_B_TO_FSBT(mp, last_page_offset -
+					       aligned_offset);
+		} else {
+			ioalign = offset_fsb;
+			iosize = last_fsb - offset_fsb;
+		}
+		/* XXX - Are we shrinking? XXXXX */
+	}
+
+	/*
+	 * Now map our desired I/O size and alignment over the
+	 * extents returned by xfs_bmapi().
+	 */
+	xfs_write_bmap(mp, io, imap, pbmapp, iosize, ioalign, isize);
+	pbmapp->pbm_delta = offset - pbmapp->pbm_offset;
+
+	ASSERT((pbmapp->pbm_bsize > 0)
+		&& (pbmapp->pbm_bsize - pbmapp->pbm_delta > 0));
+
+	/*
+	 * A bmap is the EOF bmap when it reaches to or beyond the new
+	 * inode size.
+	 */
+	if ((pbmapp->pbm_offset + pbmapp->pbm_bsize ) >= isize) {
+		pbmapp->pbm_flags |= PBMF_EOF;
+	}
+
+/*	xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP,
+			    io, offset, count, bmapp, imap); */
+
+	/* On IRIX, we walk more imaps filling in more bmaps. On Linux
+	   just handle one for now. To find the code on IRIX,
+	   look in xfs_iomap_write() in xfs_rw.c. */
+
+	*npbmaps = 1;
+	return 0;
+}
+
+/*
+ * Allocate real blocks for a write of 'count' bytes at byte 'offset'
+ * inside a transaction and convert the resulting extents to pb_bmap_t
+ * entries with _xfs_imap_to_bmap().
+ *
+ * The inode ILOCK is dropped (XFS_ILOCK_EXCL) around
+ * xfs_trans_reserve() and retaken before anything else happens, so
+ * the caller presumably enters and leaves with it held exclusive.
+ *
+ * On entry *npbmaps bounds how many mappings may be returned; on
+ * success it holds the count produced by _xfs_imap_to_bmap().  It is
+ * zeroed only on the error0/error1 unwind paths.
+ *
+ * Returns 0, ENOSPC when bmapi yields no extents, EDQUOT when the
+ * quota reservation fails, or the error of the failing call.
+ */
+STATIC int
+xfs_iomap_write_direct(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	int		found)
+{
+	xfs_inode_t	*ip = XFS_IO_INODE(io);
+	xfs_mount_t	*mp;
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_filblks_t	count_fsb;
+	xfs_fsize_t	isize;
+	xfs_fsblock_t	firstfsb;
+	int		nimaps, maps;
+	int		error;
+	xfs_trans_t	*tp;
+
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
+	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
+	xfs_bmap_free_t free_list;
+	int		aeof;
+	int		bmapi_flags;
+	xfs_filblks_t	datablocks;
+	int		rt;
+	int		committed;
+	int		numrtextents;
+	uint		resblks;
+	int		rtextsize;
+
+	maps = min(XFS_WRITE_IMAPS, *npbmaps);
+	nimaps = maps;
+
+	mp = io->io_mount;
+	isize = XFS_SIZE(mp, io);
+	if (io->io_new_size > isize)
+		isize = io->io_new_size;
+
+	/* aeof: the write ends beyond the (pending) end of file */
+	if ((offset + count) > isize) {
+		aeof = 1;
+	} else {
+		aeof = 0;
+	}
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	count_fsb = last_fsb - offset_fsb;
+	if (found && (pbmapp->pbm_flags & PBMF_HOLE)) {
+		xfs_fileoff_t	map_last_fsb;
+		map_last_fsb = XFS_B_TO_FSB(mp,
+			(pbmapp->pbm_bsize + pbmapp->pbm_offset));
+
+		/* do not allocate past the end of the hole already found */
+		if (map_last_fsb < last_fsb) {
+			last_fsb = map_last_fsb;
+			count_fsb = last_fsb - offset_fsb;
+		}
+		ASSERT(count_fsb > 0);
+	}
+
+	/*
+	 * roundup the allocation request to m_dalign boundary if file size
+	 * is greater than 512K and we are allocating past the allocation eof
+	 *
+	 * NOTE(review): count_fsb is not recomputed after last_fsb is
+	 * rounded up, and only count_fsb is handed to XFS_BMAPI below,
+	 * so the rounding does not reach the allocation request.
+	 * Left as is; confirm the intent.
+	 */
+	if (!found && mp->m_dalign && (isize >= 524288) && aeof) {
+		int eof;
+		xfs_fileoff_t new_last_fsb;
+		new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+		error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
+		if (error) {
+			goto error_out;
+		}
+		if (eof)
+			last_fsb = new_last_fsb;
+	}
+
+	/* XFS_BMAPI_DIRECT_IO is set and then cleared again right away */
+	bmapi_flags = XFS_BMAPI_WRITE|XFS_BMAPI_DIRECT_IO|XFS_BMAPI_ENTIRE;
+	bmapi_flags &= ~XFS_BMAPI_DIRECT_IO;
+
+	/*
+	 * determine if this is a realtime file
+	 */
+	if ((rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) != 0) {
+		rtextsize = mp->m_sb.sb_rextsize;
+	} else
+		rtextsize = 0;
+
+	error = 0;
+
+	/*
+	 * allocate file space for the bmapp entries passed in.
+	 */
+
+	/*
+	 * determine if reserving space on
+	 * the data or realtime partition.
+	 */
+	if (rt) {
+		numrtextents = (count_fsb + rtextsize - 1);
+		do_div(numrtextents, rtextsize);
+		datablocks = 0;
+	} else {
+		datablocks = count_fsb;
+		numrtextents = 0;
+	}
+
+	/*
+	 * allocate and setup the transaction
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_trans_reserve(tp,
+				  resblks,
+				  XFS_WRITE_LOG_RES(mp),
+				  numrtextents,
+				  XFS_TRANS_PERM_LOG_RES,
+				  XFS_WRITE_LOG_COUNT);
+
+	/*
+	 * check for running out of space
+	 */
+	if (error) {
+		/*
+		 * Free the transaction structure.
+		 */
+		xfs_trans_cancel(tp, 0);
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (error) {
+		goto error_out; /* Don't return in above if .. trans ..,
+					need lock to return */
+	}
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (xfs_trans_reserve_quota(tp,
+					    ip->i_udquot,
+					    ip->i_gdquot,
+					    resblks, 0, 0)) {
+			error = (EDQUOT);
+			goto error1;
+		}
+		nimaps = 1;
+	} else {
+		nimaps = 2;
+	}
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+
+	/*
+	 * issue the bmapi() call to allocate the blocks
+	 */
+	XFS_BMAP_INIT(&free_list, &firstfsb);
+	imapp = &imap[0];
+	error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb,
+		bmapi_flags, &firstfsb, 1, imapp, &nimaps, &free_list);
+	if (error) {
+		goto error0;
+	}
+
+	/*
+	 * complete the transaction
+	 */
+
+	error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+	if (error) {
+		goto error0;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		goto error_out;
+	}
+
+	/* copy any maps to caller's array and return any error. */
+	if (nimaps == 0) {
+		error = (ENOSPC);
+		goto error_out;
+	}
+
+	maps = min(nimaps, maps);
+	*npbmaps = _xfs_imap_to_bmap(io, offset, &imap[0], pbmapp, maps,
+								*npbmaps);
+	if(*npbmaps) {
+		/*
+		 * this is new since xfs_iomap_read
+		 * didn't find it.
+		 */
+		if (*npbmaps != 1) {
+			printk("NEED MORE WORK FOR MULTIPLE BMAPS (which are new)\n");
+		}
+	}
+	goto out;
+
+ error0:	/* Cancel bmap, unlock inode, and cancel trans */
+	xfs_bmap_cancel(&free_list);
+
+ error1:	/* Just cancel transaction */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	*npbmaps = 0;	/* nothing set-up here */
+
+error_out:
+out:	/* Just return error and any tracing at end of routine */
+	return XFS_ERROR(error);
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(struct xfs_buf *bp)
+{
+	xfs_mount_t	*mount = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
+
+	if (XFS_FORCED_SHUTDOWN(mount)) {
+		xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
+		/*
+		 * A write with no iodone handler is metadata that was
+		 * written delayed without being logged.  It belongs to
+		 * no transaction, so it is errored out and released.
+		 */
+		if (!XFS_BUF_IODONE_FUNC(bp) && !XFS_BUF_ISREAD(bp))
+			return xfs_bioerror_relse(bp);
+		return xfs_bioerror(bp);
+	}
+
+	/* filesystem is healthy: hand the buffer to pagebuf */
+	pagebuf_iorequest(bp);
+	return 0;
+}
+/*
+ * Front end to the buffer strategy call that keeps data off the
+ * disk once the filesystem has been forcibly shut down.
+ * User data normally takes this route; the superblock is one of
+ * the exceptions.
+ */
+int
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	ASSERT(mp);
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		xfs_buftrace("XFSBDSTRAT IOERROR", bp);
+		return xfs_bioerror_relse(bp);
+	}
+
+	/* Grio redirection would go here
+	 * if (XFS_BUF_IS_GRIO(bp)) {
+	 */
+	pagebuf_iorequest(bp);
+	return 0;
+}
+
+
+/*
+ * Flush the delayed-write buffers of 'target', waiting for them.
+ */
+void
+XFS_bflush(xfs_buftarg_t *target)
+{
+	pagebuf_delwri_flush(target, PBDF_WAIT, NULL);
+}
+
+
+/*
+ * Push all fs state out to disk, then write the unmount record
+ * and the superblock.
+ */
+
+void
+XFS_log_write_unmount_ro(bhv_desc_t *bdp)
+{
+	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
+	int		pinned = 0;
+	int		clean_passes = 0;
+	int		error;
+
+	xfs_refcache_purge_mp(mp);
+	xfs_binval(mp->m_ddev_targp);
+
+	/* keep flushing until two passes have found nothing pinned */
+	while (clean_passes < 2) {
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+		VFS_SYNC(XFS_MTOVFS(mp), SYNC_ATTR|SYNC_WAIT, NULL, error);
+		pagebuf_delwri_flush(mp->m_ddev_targp,
+				PBDF_WAIT, &pinned);
+		if (pinned == 0) {
+			delay(50);
+			clean_passes++;
+		}
+	}
+
+	/* Ok now write out an unmount record */
+	xfs_log_unmount_write(mp);
+	xfs_unmountfs_writesb(mp);
+}
+
+/*
+ * In these two situations we disregard the readonly mount flag and
+ * temporarily enable writes (we must, to ensure metadata integrity).
+ */
+STATIC int
+xfs_is_read_only(xfs_mount_t *mp)
+{
+	/* either device being read-only makes the write impossible */
+	if (is_read_only(mp->m_dev) || is_read_only(mp->m_logdev_targp->pbr_dev)) {
+		cmn_err(CE_NOTE,
+			"XFS: write access unavailable, cannot proceed.");
+		return EROFS;
+	}
+	cmn_err(CE_NOTE,
+		"XFS: write access will be enabled during mount.");
+	XFS_MTOVFS(mp)->vfs_flag &= ~VFS_RDONLY;
+	return 0;
+}
+
+int
+xfs_recover_read_only(xlog_t *log)
+{
+	cmn_err(CE_NOTE, "XFS: WARNING: "
+		"recovery required on readonly filesystem.");
+	return xfs_is_read_only(log->l_mp);
+}
+
+int
+xfs_quotacheck_read_only(xfs_mount_t *mp)
+{
+	cmn_err(CE_NOTE, "XFS: WARNING: "
+		"quotacheck required on readonly filesystem.");
+	return xfs_is_read_only(mp);
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_lrw.h linux-2.4-xfs/fs/xfs/linux/xfs_lrw.h
--- linux-2.4.19/fs/xfs/linux/xfs_lrw.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_lrw.h Thu Jul 18 22:33:49 2002
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __XFS_LRW_H__
+#define __XFS_LRW_H__
+
+#define XFS_IOMAP_READ_ENTER	3
+/*
+ * Maximum count of bmaps used by read and write paths.
+ */
+#define	XFS_MAX_RW_NBMAPS	4
+
+extern int xfs_bmap (bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, pb_bmap_t *, int *);
+extern int xfs_strategy (bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, pb_bmap_t *, int *);
+extern int xfsbdstrat (struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb (struct xfs_buf *);
+
+extern int xfs_zero_eof (vnode_t *, struct xfs_iocore *, xfs_off_t,
+				xfs_fsize_t, xfs_fsize_t, struct pm *);
+extern ssize_t xfs_read (
+		struct bhv_desc	*bdp,
+		struct file	*file,
+		char		*buf,
+		size_t		size,
+		loff_t		*offset,
+		struct cred	*credp);
+
+extern ssize_t xfs_write (
+		struct bhv_desc	*bdp,
+		struct file	*file,
+		const char	*buf,
+		size_t		size,
+		loff_t		*offset,
+		struct cred	*credp);
+
+extern int xfs_recover_read_only (xlog_t *);
+extern int xfs_quotacheck_read_only (xfs_mount_t *);
+
+extern void XFS_log_write_unmount_ro (bhv_desc_t *);
+
+#define XFS_FSB_TO_DB_IO(io,fsb) \
+		(((io)->io_flags & XFS_IOCORE_RT) ? \
+		 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
+
+#endif	/* __XFS_LRW_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_seq.c linux-2.4-xfs/fs/xfs/linux/xfs_seq.c
--- linux-2.4.19/fs/xfs/linux/xfs_seq.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_seq.c Mon Aug 26 15:23:20 2002
@@ -0,0 +1,42 @@
+
+/*
+ * seq_file iterator callbacks walking the xstats[] table, one entry per
+ * step.  NOTE(review): neither xstats[] nor struct xstats_entry is declared
+ * in this file, and it includes no headers -- confirm where they come from.
+ */
+static void *stats_start(struct seq_file *s, loff_t *pos)
+{
+	if (*pos >= ARRAY_SIZE(xstats))
+		return NULL;
+	return xstats + *pos;
+}
+
+static void *stats_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return stats_start(m, pos);
+}
+
+static void stats_stop(struct seq_file *m, void *v)
+{
+
+}
+
+/*
+ * Emit one line for the entry handed over by the iterator: its name, then
+ * the 32-bit counters of xfsstats at indices [start, end).  Returns 0.
+ */
+static int stats_show(struct seq_file *m, void *v)
+{
+	struct xstats_entry	*e = v;
+	int			i;
+
+	/* never pass table text as the format string itself */
+	seq_printf(m, "%s", e->desc);
+
+	for (i = e->start; i < e->end; i++)
+		seq_printf(m, " %u", *(((__u32*)&xfsstats) + i));
+	seq_printf(m, "\n");
+
+	return 0;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_stats.c linux-2.4-xfs/fs/xfs/linux/xfs_stats.c
--- linux-2.4.19/fs/xfs/linux/xfs_stats.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_stats.c Fri Aug 30 11:08:18 2002
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.
Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/proc_fs.h>
+
+/*
+ * procfs read handler for fs/xfs/stat.
+ *
+ * Formats every counter group into 'buffer' as "<name> <u32> <u32> ...\n",
+ * followed by one "xpc" line carrying the three 64-bit byte counters,
+ * then hands back the window [offset, offset + count) of that text.
+ *
+ * Each xstats[] entry names a group and gives the index one past its
+ * last counter; xfsstats is read as a flat array of __u32 up to the
+ * last endpoint.
+ *
+ * Returns the number of bytes made available at *start and sets *eof
+ * once the remainder of the text fits in 'count'.
+ */
+static int
+xfs_read_xfsstats(char *buffer, char **start, off_t offset,
+			int count, int *eof, void *data)
+{
+	int	i, j, len;
+	static struct xstats_entry {
+		char	*desc;
+		int	endpoint;
+	} xstats[] = {
+		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
+		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
+		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
+		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
+		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
+		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
+		{ "ig",			XFSSTAT_END_INODE_OPS		},
+		{ "log",		XFSSTAT_END_LOG_OPS		},
+		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
+		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
+		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
+		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
+		{ "qm",			XFSSTAT_END_QUOTA_OPS		},
+		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
+		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
+	};
+
+	for (i=j=len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) {
+		/* print the name through "%s", not as the format itself */
+		len += sprintf(buffer + len, "%s", xstats[i].desc);
+		/* inner loop does each group */
+		while (j < xstats[i].endpoint) {
+			len += sprintf(buffer + len, " %u",
+					*(((__u32*)&xfsstats) + j));
+			j++;
+		}
+		buffer[len++] = '\n';
+	}
+	/* extra precision counters */
+	len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+			xfsstats.xs_xstrat_bytes,
+			xfsstats.xs_write_bytes,
+			xfsstats.xs_read_bytes);
+
+	/* reader is already past the end of the text */
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+
+/*
+ * procfs read handler for fs/xfs/xqm: one line of four tab-separated
+ * numbers.  The last three are reported as 0 while xfs_Gqm is NULL.
+ * Windowing by offset/count works as in xfs_read_xfsstats().
+ */
+static int
+xfs_read_xfsquota(char *buffer, char **start, off_t offset,
+			int count, int *eof, void *data)
+{
+	int	len;
+
+	/* maximum; incore; ratio free to inuse; freelist */
+	len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+			ndquot,
+			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
+			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
+			xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+
+/*
+ * Create the fs/xfs proc directory and its "stat" and "xqm" entries.
+ * Nothing is created if the directory cannot be made; failures to
+ * create the individual entries are not reported.
+ */
+void
+xfs_init_procfs(void)
+{
+	if (!proc_mkdir("fs/xfs", 0))
+		return;
+	create_proc_read_entry("fs/xfs/stat", 0, 0, xfs_read_xfsstats, NULL);
+	create_proc_read_entry("fs/xfs/xqm", 0, 0, xfs_read_xfsquota, NULL);
+}
+
+/*
+ * Remove the two proc entries, then the directory that held them.
+ */
+void
+xfs_cleanup_procfs(void)
+{
+	remove_proc_entry("fs/xfs/stat", NULL);
+	remove_proc_entry("fs/xfs/xqm", NULL);
+	remove_proc_entry("fs/xfs", NULL);
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_stats.h linux-2.4-xfs/fs/xfs/linux/xfs_stats.h
--- linux-2.4.19/fs/xfs/linux/xfs_stats.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_stats.h Fri Aug 30 11:08:18 2002
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.
Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_STATS_H__ +#define __XFS_STATS_H__ + +/* + * procfs interface + */ +#ifdef CONFIG_PROC_FS +extern void xfs_init_procfs(void); +extern void xfs_cleanup_procfs(void); +#else +static __inline void xfs_init_procfs(void) { }; +static __inline void xfs_cleanup_procfs(void) { }; +#endif + +#endif /* __XFS_STATS_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_super.c linux-2.4-xfs/fs/xfs/linux/xfs_super.c --- linux-2.4.19/fs/xfs/linux/xfs_super.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_super.c Thu Sep 5 15:35:08 2002 @@ -0,0 +1,935 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. 
Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include +#include +#include +#include +#include "xfs_version.h" + +/* xfs_vfs[ops].c */ +extern int xfs_init(void); +extern void xfs_cleanup(void); + +/* For kernels which have the s_maxbytes field - set it */ +#ifdef MAX_NON_LFS +# define set_max_bytes(sb) ((sb)->s_maxbytes = XFS_MAX_FILE_OFFSET) +#else +# define set_max_bytes(sb) do { } while (0) +#endif + +#ifdef CONFIG_FS_POSIX_ACL +# define set_posix_acl(sb) ((sb)->s_flags |= MS_POSIXACL) +#else +# define set_posix_acl(sb) do { } while (0) +#endif + +#ifdef CONFIG_XFS_QUOTA +static struct quotactl_ops linvfs_qops = { + .get_xstate = linvfs_getxstate, + .set_xstate = linvfs_setxstate, + .get_xquota = linvfs_getxquota, + .set_xquota = linvfs_setxquota, +}; +# define set_quota_ops(sb) ((sb)->s_qcop = &linvfs_qops) +#else +# define set_quota_ops(sb) do { } while (0) +#endif + +#ifdef CONFIG_XFS_DMAPI +int dmapi_init(void); +void dmapi_uninit(void); +#else +#define dmapi_init() +#define dmapi_uninit() +#endif + +static struct super_operations linvfs_sops; + +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O 
device */
+#define MNTOPT_DMAPI	"dmapi"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_BIOSIZE	"biosize"	/* log2 of preferred buffered io size */
+#define MNTOPT_WSYNC	"wsync"		/* safe-mode nfs compatible mount */
+#define MNTOPT_INO64	"ino64"		/* force inodes into 64-bit range */
+#define MNTOPT_NOALIGN	"noalign"	/* turn off stripe alignment */
+#define MNTOPT_SUNIT	"sunit"		/* data volume stripe unit */
+#define MNTOPT_SWIDTH	"swidth"	/* data volume stripe width */
+#define MNTOPT_NORECOVERY "norecovery"	/* don't run XFS recovery */
+#define MNTOPT_OSYNCISDSYNC "osyncisdsync" /* o_sync == o_dsync on this fs */
+					/* (this is now the default!) */
+#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
+#define MNTOPT_QUOTA	"quota"		/* disk quotas */
+#define MNTOPT_MRQUOTA	"mrquota"	/* don't turnoff if SB has quotas on */
+#define MNTOPT_NOQUOTA	"noquota"	/* no quotas */
+#define MNTOPT_UQUOTA	"usrquota"	/* user quota enabled */
+#define MNTOPT_GQUOTA	"grpquota"	/* group quota enabled */
+#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
+#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
+#define MNTOPT_QUOTANOENF  "qnoenforce" /* same as uqnoenforce */
+#define MNTOPT_NOUUID	"nouuid"	/* Ignore FS uuid */
+#define MNTOPT_IRIXSGID "irixsgid"	/* Irix-style sgid inheritance */
+#define MNTOPT_NOLOGFLUSH  "nologflush" /* Don't use hard flushes in
+					   log writing */
+#define MNTOPT_MTPT	"mtpt"		/* filesystem mount point */
+
+/*
+ * Return nonzero, after logging a message, when option 'opt' was given
+ * without a value ("opt" or "opt=").  Options that take a value must
+ * be checked with this before 'value' is dereferenced.
+ */
+STATIC int
+xfs_mntopt_missing_arg(
+	const char	*opt,
+	const char	*value)
+{
+	if (value && *value)
+		return 0;
+	printk("XFS: %s option requires an argument\n", opt);
+	return 1;
+}
+
+/*
+ * Parse the comma separated mount option string 'options' into *args.
+ *
+ * 'options' is modified in place (strsep, and '=' is overwritten with
+ * a NUL).  'flags' are the mount(2) flags; MS_NOATIME is copied into
+ * args->flags and MS_RDONLY is required for norecovery mounts.
+ * *args is zeroed first, so the name fields start out empty.
+ *
+ * Returns 0 on success and 1 on any error: an unknown option, a
+ * missing value, or an inconsistent sunit/swidth/noalign combination.
+ * A message is logged for each failure.
+ */
+STATIC int
+xfs_parseargs(
+	char			*options,
+	int			flags,
+	struct xfs_mount_args	*args)
+{
+	char			*this_char, *value, *eov;
+	int			logbufs = -1;
+	int			logbufsize = -1;
+	int			dsunit, dswidth, vol_dsunit, vol_dswidth;
+	int			iosize;
+	int			rval = 1;	/* failure is default */
+
+	iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
+	memset(args, 0, sizeof(struct xfs_mount_args));
+	args->slcount = args->stimeout = args->ctimeout = -1;
+	args->mtpt[0] = '\0';
+
+	/* Copy the already-parsed mount(2) flags we're interested in */
+	if (flags & MS_NOATIME)
+		args->flags |= XFSMNT_NOATIME;
+
+	if (!options) {
+		args->logbufs = logbufs;
+		args->logbufsize = logbufsize;
+		return 0;
+	}
+
+	while ((this_char = strsep (&options, ",")) != NULL) {
+		if (!*this_char)
+			continue;
+		/* split "name=value"; value stays NULL for a bare name */
+		if ((value = strchr (this_char, '=')) != NULL)
+			*value++ = 0;
+
+		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			logbufs = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
+			int	in_kilobytes = 0;
+
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			/* value is non-empty here, so strlen()-1 is safe */
+			if (toupper(value[strlen(value)-1]) == 'K') {
+				in_kilobytes = 1;
+				value[strlen(value)-1] = '\0';
+			}
+			logbufsize = simple_strtoul(value, &eov, 10);
+			if (in_kilobytes)
+				logbufsize = logbufsize * 1024;
+		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			strncpy(args->logname, value, MAXNAMELEN);
+		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			strncpy(args->mtpt, value, MAXNAMELEN);
+#ifdef CONFIG_XFS_DMAPI
+		} else if (!strcmp(this_char, MNTOPT_DMAPI)) {
+			args->flags |= XFSMNT_DMAPI;
+		} else if (!strcmp(this_char, MNTOPT_XDSM)) {
+			args->flags |= XFSMNT_DMAPI;
+#else
+		} else if (!strcmp(this_char, MNTOPT_DMAPI) ||
+			   !strcmp(this_char, MNTOPT_XDSM)) {
+			printk("XFS: this kernel does not support dmapi/xdsm.\n");
+			return rval;
+#endif
+		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			strncpy(args->rtname, value, MAXNAMELEN);
+		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			iosize = simple_strtoul(value, &eov, 10);
+			args->flags |= XFSMNT_IOSIZE;
+			args->iosizelog = (uint8_t) iosize;
+		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+			args->flags |= XFSMNT_WSYNC;
+		} else if (!strcmp(this_char, MNTOPT_OSYNCISDSYNC)) {
+			/* no-op, this is now the default */
+			printk("XFS: osyncisdsync is now the default, and will soon be deprecated.\n");
+		} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
+			args->flags |= XFSMNT_OSYNCISOSYNC;
+		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+			args->flags |= XFSMNT_NORECOVERY;
+		} else if (!strcmp(this_char, MNTOPT_INO64)) {
+#ifdef XFS_BIG_FILESYSTEMS
+			args->flags |= XFSMNT_INO64;
+#else
+			printk("XFS: ino64 option not allowed on this system\n");
+			return rval;
+#endif
+		} else if (!strcmp(this_char, MNTOPT_UQUOTA)) {
+			args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_QUOTA)) {
+			args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+			args->flags |= XFSMNT_UQUOTA;
+			args->flags &= ~XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_QUOTANOENF)) {
+			args->flags |= XFSMNT_UQUOTA;
+			args->flags &= ~XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_MRQUOTA)) {
+			args->flags |= XFSMNT_QUOTAMAYBE;
+		} else if (!strcmp(this_char, MNTOPT_GQUOTA)) {
+			args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+			args->flags |= XFSMNT_GQUOTA;
+			args->flags &= ~XFSMNT_GQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+			args->flags |= XFSMNT_NOALIGN;
+		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			dsunit = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
+			if (xfs_mntopt_missing_arg(this_char, value))
+				return rval;
+			dswidth = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+			args->flags |= XFSMNT_NOUUID;
+		} else if (!strcmp(this_char, MNTOPT_IRIXSGID)) {
+			args->flags |= XFSMNT_IRIXSGID;
+		} else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
+			args->flags |= XFSMNT_NOLOGFLUSH;
+		} else {
+			printk("XFS: unknown mount option [%s].\n", this_char);
+			return rval;
+		}
+	}
+
+	if (args->flags & XFSMNT_NORECOVERY) {
+		if ((flags & MS_RDONLY) == 0) {
+			printk("XFS: no-recovery mounts must be read-only.\n");
+			return rval;
+		}
+	}
+
+	if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
+		printk(
+	"XFS: sunit and swidth options incompatible with the noalign option\n");
+		return rval;
+	}
+
+	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
+		printk("XFS: sunit and swidth must be specified together\n");
+		return rval;
+	}
+
+	if (dsunit && (dswidth % dsunit != 0)) {
+		printk(
+	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
+			dswidth, dsunit);
+		return rval;
+	}
+
+	if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+		if (dsunit) {
+			args->sunit = dsunit;
+			args->flags |= XFSMNT_RETERR;
+		} else
+			args->sunit = vol_dsunit;
+		args->swidth = dswidth ? dswidth : vol_dswidth;
+	} else
+		args->sunit = args->swidth = 0;
+
+	args->logbufs = logbufs;
+	args->logbufsize = logbufsize;
+
+	return 0;
+}
+
+/*
+ * Convert one device special file to a dev_t.
+ * Helper routine, used only by spectodevs below.
+ *
+ * 'id' is only used to label the error message.  Returns 0 with *dev
+ * set, or a negative errno with *dev untouched.
+ */
+STATIC int
+spectodev(
+	const char	*name,
+	const char	*id,
+	dev_t		*dev)
+{
+	struct nameidata nd;
+	int	rval = 0;
+
+	if (path_init(name, LOOKUP_FOLLOW, &nd))
+		rval = path_walk(name, &nd);
+	if (rval) {
+		/*
+		 * A failed path_walk() has already released nd, so it
+		 * must neither be looked at nor released again here.
+		 */
+		printk("XFS: Invalid %s device [%s], err=%d\n", id, name, rval);
+		return rval;
+	}
+	/* Watch out for negative dentries */
+	if (!nd.dentry->d_inode) {
+		rval = -ENOENT;
+		printk("XFS: Invalid %s device [%s], err=%d\n", id, name, rval);
+	} else
+		*dev = nd.dentry->d_inode->i_rdev;
+	path_release(&nd);
+	return rval;
+}
+
+/*
+ * Convert device special files to dev_t for data, log, realtime.
+ */
+int
+spectodevs(
+	struct super_block *sb,
+	struct xfs_mount_args *args,
+	dev_t		*ddevp,
+	dev_t		*logdevp,
+	dev_t		*rtdevp)
+{
+	int		rval = 0;
+
+	/* The data device is always the device the superblock sits on. */
+	*ddevp = sb->s_dev;
+
+	/* No explicit log device name: the log shares the data device. */
+	if (args->logname[0])
+		rval = spectodev(args->logname, "log", logdevp);
+	else
+		*logdevp = sb->s_dev;
+
+	/*
+	 * The realtime device is only looked up when a name was given and
+	 * the log lookup did not fail; in every other case it is set to 0.
+	 */
+	if (args->rtname[0] && !rval)
+		rval = spectodev(args->rtname, "realtime", rtdevp);
+	else
+		*rtdevp = 0;
+	return rval;
+}
+
+
+/* Slab cache from which the vnodes (and the inodes tied to them) come. */
+static kmem_cache_t * linvfs_inode_cachep;
+
+/*
+ * Choose the allocation mask for a new vnode: GFP_NOFS when the current
+ * task has PF_FSTRANS set, GFP_KERNEL otherwise.
+ */
+static __inline__ unsigned int gfp_mask(void)
+{
+	/* If we're not in a transaction, FS activity is ok */
+	if (current->flags & PF_FSTRANS) return GFP_NOFS;
+	return GFP_KERNEL;
+}
+
+
+/*
+ * alloc_inode super operation: take a vnode from the cache and return
+ * the inode LINVFS_GET_IP() yields for it, or NULL if allocation fails.
+ */
+static struct inode *linvfs_alloc_inode(struct super_block *sb)
+{
+	vnode_t	*vp;
+
+	vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_cachep, gfp_mask());
+	if (!vp)
+		return NULL;
+	return LINVFS_GET_IP(vp);
+}
+
+/* destroy_inode super operation: give the inode's vnode back to the cache. */
+static void linvfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(linvfs_inode_cachep, LINVFS_GET_VP(inode));
+}
+
+/*
+ * Slab constructor for linvfs_inode_cachep.  inode_init_once() is run
+ * only when SLAB_CTOR_CONSTRUCTOR is set and SLAB_CTOR_VERIFY is clear.
+ */
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+	vnode_t *vp = (vnode_t *)foo;
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(LINVFS_GET_IP(vp));
+}
+
+/* Create the vnode cache; returns 0 on success or -ENOMEM. */
+static int init_inodecache(void)
+{
+	linvfs_inode_cachep = kmem_cache_create("linvfs_icache",
+				sizeof(vnode_t), 0, SLAB_HWCACHE_ALIGN,
+				init_once, NULL);
+
+	if (linvfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Destroy the vnode cache, logging if kmem_cache_destroy() reports failure. */
+static void destroy_inodecache(void)
+{
+	if (kmem_cache_destroy(linvfs_inode_cachep))
+		printk(KERN_INFO "linvfs_inode_cache: not all structures were freed\n");
+}
+
+/*
+ * read_super entry point for the "xfs" filesystem type.
+ *
+ * Parses the mount options in 'data' into a temporary xfs_mount_args,
+ * allocates a vfs_t and attaches it to 'sb', mounts through xfs_vfsops,
+ * fills in the superblock fields from VFS_STATVFS, and installs the
+ * root dentry.  Returns 'sb' on success and NULL on any failure; the
+ * temporary args are freed on every path.  'silent' is not used here.
+ */
+struct super_block *
+linvfs_read_super(
+	struct super_block *sb,
+	void		*data,
+	int		silent)
+{
+	vfs_t		*vfsp;
+	vfsops_t	*vfsops;
+	vnode_t		*rootvp;
+	struct inode	*ip;
+	struct xfs_mount_args *args;
+	struct statfs	statvfs;
+	int		error;
+
+	args = (struct xfs_mount_args *)kmalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+	if (!args)
+		return NULL;
+
+	if (xfs_parseargs((char *)data, sb->s_flags, args))
+		goto out_null;
+	strncpy(args->fsname, bdevname(sb->s_dev), MAXNAMELEN);
+	/* args->rtdev and args->logdev done in xfs_parseargs */
+
+	/* Kludge in XFS until we have other VFS/VNODE FSs */
+	vfsops = &xfs_vfsops;
+
+	/* Set up the vfs_t structure */
+	vfsp = vfs_allocate();
+	if (!vfsp)
+		goto out_null;
+
+	if (sb->s_flags & MS_RDONLY)
+		vfsp->vfs_flag |= VFS_RDONLY;
+
+	vfsp->vfs_super = sb;
+	set_blocksize(sb->s_dev, BBSIZE);
+	set_max_bytes(sb);
+	set_quota_ops(sb);
+	sb->s_op = &linvfs_sops;
+
+	LINVFS_SET_VFS(sb, vfsp);
+
+	VFSOPS_MOUNT(vfsops, vfsp, args, NULL, error);
+	if (error)
+		goto fail_vfsop;
+
+	VFS_STATVFS(vfsp, &statvfs, NULL, error);
+	if (error)
+		goto fail_unmount;
+
+	sb->s_magic = XFS_SB_MAGIC;
+	sb->s_dirt = 1;
+	sb->s_blocksize = statvfs.f_bsize;
+	sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1;
+
+	VFS_ROOT(vfsp, &rootvp, error);
+	if (error)
+		goto fail_unmount;
+
+	ip = LINVFS_GET_IP(rootvp);
+	linvfs_revalidate_core(ip, ATTR_COMM);
+
+	sb->s_root = d_alloc_root(ip);
+	if (!sb->s_root)
+		goto fail_vnrele;
+	if (is_bad_inode(sb->s_root->d_inode))
+		goto fail_vnrele;
+
+	/* Don't set the VFS_DMI flag until here because we don't want
+	 * to send events while replaying the log.
+	 */
+	if (args->flags & XFSMNT_DMAPI) {
+		vfsp->vfs_flag |= VFS_DMI;
+		VFSOPS_DMAPI_MOUNT(vfsops, vfsp, args->mtpt, args->fsname,
+				   error);
+		if (error) {
+			if (atomic_read(&sb->s_active) == 1)
+				vfsp->vfs_flag &= ~VFS_DMI;
+			goto fail_vnrele;
+		}
+	}
+	set_posix_acl(sb);
+
+	vn_trace_exit(rootvp, "linvfs_read_super", (inst_t *)__return_address);
+
+	kfree(args);
+	return(sb);
+
+	/*
+	 * Error unwinding; each label falls through into the next.  The
+	 * root is dropped through its dentry when one was installed, and
+	 * through VN_RELE() otherwise.
+	 */
+fail_vnrele:
+	if (sb->s_root) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+	} else {
+		VN_RELE(rootvp);
+	}
+
+fail_unmount:
+	VFS_UNMOUNT(vfsp, 0, NULL, error);
+
+fail_vfsop:
+	vfs_deallocate(vfsp);
+
+out_null:
+	kfree(args);
+	return(NULL);
+}
+
+/*
+ * Set i_mode from the vnode type and, for an inode still marked I_NEW,
+ * install the inode/file/address-space operations matching its type and
+ * then unlock it.  A VNON vnode turns the inode into a bad inode.
+ */
+void
+linvfs_set_inode_ops(
+	struct inode	*inode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	inode->i_mode = VTTOIF(vp->v_type);
+
+	/* If this isn't a new inode, nothing to do */
+	if (!(inode->i_state & I_NEW))
+		return;
+
+	if (vp->v_type == VNON) {
+		make_bad_inode(inode);
+	} else if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &linvfs_file_inode_operations;
+		inode->i_fop = &linvfs_file_operations;
+		inode->i_mapping->a_ops = &linvfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &linvfs_dir_inode_operations;
+		inode->i_fop = &linvfs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &linvfs_symlink_inode_operations;
+		/* address-space ops only for symlinks that own blocks */
+		if (inode->i_blocks)
+			inode->i_mapping->a_ops = &linvfs_aops;
+	} else {
+		/* every remaining type goes through init_special_inode() */
+		inode->i_op = &linvfs_file_inode_operations;
+		init_special_inode(inode, inode->i_mode,
+					kdev_t_to_nr(inode->i_rdev));
+	}
+
+	unlock_new_inode(inode);
+}
+
+/*
+ * We do not actually write the inode here, just mark the
+ * super block dirty so that sync_supers calls us and
+ * forces the flush.
+ */
+void
+linvfs_write_inode(
+	struct inode	*inode,
+	int		sync)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error, flags = FLUSH_INODE;
+
+	if (vp) {
+		vn_trace_entry(vp, "linvfs_write_inode",
+					(inst_t *)__return_address);
+
+		if (sync)
+			flags |= FLUSH_SYNC;
+		VOP_IFLUSH(vp, flags, error);
+		/* flush could not be done now: have write_super retry */
+		if (error == EAGAIN)
+			inode->i_sb->s_dirt = 1;
+	}
+}
+
+/* clear_inode super operation: release, mark VPURGE and remove the vnode. */
+void
+linvfs_clear_inode(
+	struct inode	*inode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (vp) {
+		vn_rele(vp);
+		vn_trace_entry(vp, "linvfs_clear_inode",
+					(inst_t *)__return_address);
+		/*
+		 * Do all our cleanup, and remove this vnode.
+		 */
+		vp->v_flag |= VPURGE;
+		vn_remove(vp);
+	}
+}
+
+/*
+ * put_inode super operation: issue VOP_RELEASE when the vnode has a
+ * behavior attached and i_count reads 1.
+ */
+void
+linvfs_put_inode(
+	struct inode	*ip)
+{
+	vnode_t		*vp = LINVFS_GET_VP(ip);
+	int		error;
+
+	if (vp && vp->v_fbhv && (atomic_read(&ip->i_count) == 1))
+		VOP_RELEASE(vp, error);
+}
+
+/*
+ * put_super super operation: unmount, free the vfs_t and reset the
+ * device block size.  If the unmount fails the vfs_t is left allocated.
+ */
+void
+linvfs_put_super(
+	struct super_block *sb)
+{
+	int		error;
+	int		sector_size = BBSIZE;
+	kdev_t		dev = sb->s_dev;
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+
+	VFS_DOUNMOUNT(vfsp, 0, NULL, NULL, error);
+	if (error) {
+		printk("XFS unmount got error %d\n", error);
+		printk("linvfs_put_super: vfsp/0x%p left dangling!\n", vfsp);
+		return;
+	}
+
+	vfs_deallocate(vfsp);
+
+	/* Reset device block size */
+	if (hardsect_size[MAJOR(dev)])
+		sector_size = hardsect_size[MAJOR(dev)][MINOR(dev)];
+	set_blocksize(dev, sector_size);
+}
+
+/*
+ * write_super super operation: clear s_dirt and, unless the filesystem
+ * is read-only, start a VFS_SYNC with the BDFLUSH flag set.
+ */
+void
+linvfs_write_super(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+	int		error;
+
+	sb->s_dirt = 0;
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR,
+		NULL, error);
+}
+
+/*
+ * statfs super operation.  Returns 0 or a negative errno.
+ */
+int
+linvfs_statfs(
+	struct super_block *sb,
+	struct statfs	*statp)
+{
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+	int		error;
+
+	VFS_STATVFS(vfsp, statp, NULL, error);
+
+	/*
+	 * Negate the XFS error for the Linux caller, as vn_revalidate()
+	 * and vn_reclaim() do.
+	 * NOTE(review): assumes VFS_STATVFS follows the same positive-errno
+	 * convention as the VOP_ operations - confirm.
+	 */
+	return -error;
+}
+
+/*
+ * remount_fs super operation: re-parse the options, update the noatime
+ * mount flag, flush, and switch the vfs between read-only and
+ * read-write when *flags asks for a change.  Returns 0, -ENOMEM or
+ * -EINVAL.
+ */
+int
+linvfs_remount(
+	struct super_block *sb,
+	int		*flags,
+	char		*options)
+{
+	struct xfs_mount_args *args;
+	vfs_t		*vfsp;
+	xfs_mount_t	*mp;
+	int		error = 0;
+
+	args = (struct xfs_mount_args *)kmalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+
+	if (xfs_parseargs(options, *flags, args)) {
+		error = -EINVAL;
+		goto out;
+	}
+	set_posix_acl(sb);
+
+	if (args->flags & XFSMNT_NOATIME)
+		mp->m_flags |= XFS_MOUNT_NOATIME;
+	else
+		mp->m_flags &= ~XFS_MOUNT_NOATIME;
+
+	linvfs_write_super(sb);
+
+	/* read-only state unchanged: nothing more to do */
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out;
+
+	if (*flags & MS_RDONLY) {
+		sb->s_flags |= MS_RDONLY;
+		XFS_log_write_unmount_ro(vfsp->vfs_fbhv);
+		vfsp->vfs_flag |= VFS_RDONLY;
+	} else {
+		vfsp->vfs_flag &= ~VFS_RDONLY;
+	}
+
+out:
+	kfree(args);
+	return error;
+}
+
+/*
+ * write_super_lockfs super operation: send XFS_IOC_FREEZE to the root
+ * vnode.  Nothing is done for a read-only filesystem.
+ */
+void
+linvfs_freeze_fs(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp;
+	vnode_t		*vp;
+	int		error;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	VFS_ROOT(vfsp, &vp, error);
+	/* vp is only valid when the root lookup succeeded */
+	if (error)
+		return;
+	VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_FREEZE, 0, error);
+	VN_RELE(vp);
+}
+
+/* unlockfs super operation: send XFS_IOC_THAW to the root vnode. */
+void
+linvfs_unfreeze_fs(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp;
+	vnode_t		*vp;
+	int		error;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	VFS_ROOT(vfsp, &vp, error);
+	/* vp is only valid when the root lookup succeeded */
+	if (error)
+		return;
+	VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_THAW, 0, error);
+	VN_RELE(vp);
+}
+
+
+/*
+ * Encode a file handle for 'dentry' into data[]: inode number and
+ * generation, followed by the parent's when need_parent is set and
+ * *lenp allows four words.  Returns the handle type (2 or 4), or 255
+ * when fewer than three words are available; *lenp is set to the
+ * number of words stored.  The VOP_FID2 error is not checked.
+ */
+STATIC int linvfs_dentry_to_fh(
+	struct dentry	*dentry,
+	__u32		*data,
+	int		*lenp,
+	int		need_parent)
+{
+	struct inode	*inode = dentry->d_inode ;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		maxlen = *lenp;
+	xfs_fid2_t	fid;
+	int		error;
+
+	if (maxlen < 3)
+		return 255 ;
+
+	VOP_FID2(vp, (struct fid *)&fid, error);
+	data[0] = (__u32)fid.fid_ino;	/* 32 bits of inode is OK */
+	data[1] = fid.fid_gen;
+
+	*lenp = 2 ;
+	if (maxlen < 4 || ! need_parent)
+		return 2 ;
+
+	inode = dentry->d_parent->d_inode ;
+	vp = LINVFS_GET_VP(inode);
+
+	VOP_FID2(vp, (struct fid *)&fid, error);
+	data[2] = (__u32)fid.fid_ino;	/* 32 bits of inode is OK */
+	*lenp = 3 ;
+	/* cannot be true here: maxlen < 4 already returned above */
+	if (maxlen < 4)
+		return 3 ;
+	data[3] = fid.fid_gen;
+	*lenp = 4 ;
+	return 4 ;
+}
+
+/*
+ * Decode a file handle: build an xfs_fid2_t from data[] (the parent's
+ * words when 'parent' is set), look the vnode up with VFS_VGET, and
+ * return a connected alias of its inode if one exists, otherwise a new
+ * dentry flagged DCACHE_NFSD_DISCONNECTED.  Returns ERR_PTR(-ESTALE)
+ * or ERR_PTR(-ENOMEM) on failure.  'len' is not checked here.
+ */
+STATIC struct dentry *linvfs_fh_to_dentry(
+	struct super_block *sb,
+	__u32		*data,
+	int		len,
+	int		fhtype,
+	int		parent)
+{
+	vnode_t		*vp;
+	struct inode	*inode = NULL;
+	struct list_head *lp;
+	struct dentry	*result;
+	xfs_fid2_t	xfid;
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+	int		error;
+
+	xfid.fid_len = sizeof(xfs_fid2_t) - sizeof(xfid.fid_len);
+	xfid.fid_pad = 0;
+
+	if (!parent) {
+		xfid.fid_gen = data[1];
+		xfid.fid_ino = (__u64)data[0];
+	} else {
+		/* only type-4 handles carry the parent generation */
+		if (fhtype == 4)
+			xfid.fid_gen = data[3];
+		else
+			xfid.fid_gen = 0;
+		xfid.fid_ino = (__u64)data[2];
+	}
+
+	VFS_VGET(vfsp, &vp, (fid_t *)&xfid, error);
+	if (error || vp == NULL)
+		return ERR_PTR(-ESTALE) ;
+
+	inode = LINVFS_GET_IP(vp);
+	spin_lock(&dcache_lock);
+	for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) {
+		result = list_entry(lp,struct dentry, d_alias);
+		if (! (result->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+			dget_locked(result);
+			result->d_vfs_flags |= DCACHE_REFERENCED;
+			spin_unlock(&dcache_lock);
+			iput(inode);
+			return result;
+		}
+	}
+	spin_unlock(&dcache_lock);
+	result = d_alloc_root(inode);
+	if (result == NULL) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	result->d_flags |= DCACHE_NFSD_DISCONNECTED;
+	return result;
+}
+
+/*
+ * show_options super operation: write the mount options that can be
+ * read back from the xfs_mount and vfs structures to 'm'.  Returns 0.
+ */
+int
+linvfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	vfs_t			*vfsp;
+	xfs_mount_t		*mp;
+	static struct proc_xfs_info {
+		int	flag;
+		char	*str;
+	} xfs_info[] = {
+		/* the few simple ones we can get from the mount struct */
+		{ XFS_MOUNT_NOALIGN,		",noalign" },
+		{ XFS_MOUNT_NORECOVERY,		",norecovery" },
+		{ XFS_MOUNT_OSYNCISOSYNC,	",osyncisosync" },
+		{ XFS_MOUNT_NOUUID,		",nouuid" },
+		{ XFS_MOUNT_IRIXSGID,		",irixsgid" },
+		{ 0, NULL }
+	};
+
+	struct proc_xfs_info	*xfs_infop;
+
+	vfsp = LINVFS_GET_VFS(mnt->mnt_sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+
+	for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
+		if (mp->m_flags & xfs_infop->flag)
+			seq_puts(m, xfs_infop->str);
+	}
+
+	if (mp->m_qflags & XFS_UQUOTA_ACCT) {
+		seq_puts(m, ",uquota");
+		if (!(mp->m_qflags & XFS_UQUOTA_ENFD))
+			seq_puts(m, ",uqnoenforce");
+	}
+
+	if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		seq_puts(m, ",gquota");
+		if (!(mp->m_qflags & XFS_GQUOTA_ENFD))
+			seq_puts(m, ",gqnoenforce");
+	}
+
+	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+		seq_printf(m, ",biosize=%d", mp->m_writeio_log);
+
+	if (mp->m_logbufs > 0)
+		seq_printf(m, ",logbufs=%d", mp->m_logbufs);
+
+	if (mp->m_logbsize > 0)
+		seq_printf(m, ",logbsize=%d", mp->m_logbsize);
+
+	if (mp->m_ddev_targp->pbr_dev != mp->m_logdev_targp->pbr_dev)
+		seq_printf(m, ",logdev=%s",
+				bdevname(mp->m_logdev_targp->pbr_dev));
+
+	if (mp->m_rtdev_targp &&
+	    mp->m_ddev_targp->pbr_dev != mp->m_rtdev_targp->pbr_dev)
+		seq_printf(m, ",rtdev=%s",
+				bdevname(mp->m_rtdev_targp->pbr_dev));
+
+	if (mp->m_dalign > 0)
+		seq_printf(m, ",sunit=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+
+	if (mp->m_swidth > 0)
+		seq_printf(m, ",swidth=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+	if (vfsp->vfs_flag & VFS_DMI)
+		seq_puts(m, ",dmapi");
+
+	return 0;
+}
+
+static struct super_operations linvfs_sops = {
+	.alloc_inode		= linvfs_alloc_inode,
+	.destroy_inode		= linvfs_destroy_inode,
+	.write_inode		= linvfs_write_inode,
+	.put_inode		= linvfs_put_inode,
+	.clear_inode		= linvfs_clear_inode,
+	.put_super		= linvfs_put_super,
+	.write_super		= linvfs_write_super,
+	.write_super_lockfs	= linvfs_freeze_fs,
+	.unlockfs		= linvfs_unfreeze_fs,
+	.statfs			= linvfs_statfs,
+	.remount_fs		= linvfs_remount,
+	.fh_to_dentry		= linvfs_fh_to_dentry,
+	.dentry_to_fh		= linvfs_dentry_to_fh,
+	.show_options		= linvfs_show_options,
+};
+
+static struct file_system_type xfs_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= "xfs",
+	.read_super		= linvfs_read_super,
+	.fs_flags		= FS_REQUIRES_DEV,
+};
+
+/*
+ * Module initialisation: create the vnode cache, bring up pagebuf, the
+ * vnode layer, XFS proper and dmapi, then register the filesystem.
+ * Returns 0 on success; on failure everything already set up is torn
+ * down again and the error is returned.
+ */
+static int __init init_xfs_fs(void)
+{
+	int error;
+	struct sysinfo	si;
+	static char message[] __initdata =
+		KERN_INFO "SGI XFS " XFS_VERSION_STRING " with "
+		XFS_BUILD_OPTIONS " enabled\n";
+
+	error = init_inodecache();
+	if (error < 0)
+		return error;
+
+	error = pagebuf_init();
+	if (error < 0)
+		goto out;
+
+	si_meminfo(&si);
+	xfs_physmem = si.totalram;
+
+	printk(message);
+
+	vn_init();
+	xfs_init();
+	dmapi_init();
+
+	error = register_filesystem(&xfs_fs_type);
+	if (error)
+		goto out_teardown;
+	return 0;
+
+out_teardown:
+	/*
+	 * Undo the initialisation above in the order exit_xfs_fs() uses.
+	 * NOTE(review): assumes these helpers are not placed in a
+	 * discarded __exit section when XFS is built in - confirm.
+	 */
+	dmapi_uninit();
+	xfs_cleanup();
+	pagebuf_terminate();
+out:
+	destroy_inodecache();
+	return error;
+}
+
+
+/* Module exit: tear down what init_xfs_fs() set up. */
+static void __exit exit_xfs_fs(void)
+{
+	dmapi_uninit();
+	xfs_cleanup();
+	unregister_filesystem(&xfs_fs_type);
+	pagebuf_terminate();
+	destroy_inodecache();
+}
+
+module_init(init_xfs_fs);
+module_exit(exit_xfs_fs);
+
+MODULE_AUTHOR("SGI ");
+MODULE_DESCRIPTION("SGI XFS " XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
+MODULE_LICENSE("GPL");
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_super.h linux-2.4-xfs/fs/xfs/linux/xfs_super.h
---
linux-2.4.19/fs/xfs/linux/xfs_super.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_super.h Sat Aug 24 17:08:33 2002 @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPER_H__
+#define __XFS_SUPER_H__
+
+/*
+ * Build-option strings.  Each is either a short label or empty,
+ * depending on the configuration; XFS_BUILD_OPTIONS concatenates them.
+ */
+#ifdef CONFIG_FS_POSIX_ACL
+# define XFS_ACL_STRING		"ACLs, "
+#else
+# define XFS_ACL_STRING
+#endif
+
+#ifdef CONFIG_XFS_DMAPI
+# define XFS_DMAPI_STRING	"DMAPI, "
+#else
+# define XFS_DMAPI_STRING
+#endif
+
+#ifdef CONFIG_XFS_QUOTA
+# define XFS_QUOTA_STRING	"quota, "
+#else
+# define XFS_QUOTA_STRING
+#endif
+
+#ifdef CONFIG_XFS_RT
+# define XFS_RT_STRING		"realtime, "
+#else
+# define XFS_RT_STRING
+#endif
+
+#ifdef CONFIG_XFS_VNODE_TRACING
+# define XFS_VNTRACE_STRING	"VN-trace, "
+#else
+# define XFS_VNTRACE_STRING
+#endif
+
+#ifdef XFSDEBUG
+# define XFS_DBG_STRING		"debug"
+#else
+# define XFS_DBG_STRING		"no debug"
+#endif
+
+#define XFS_BUILD_OPTIONS	XFS_ACL_STRING XFS_DMAPI_STRING \
+				XFS_RT_STRING \
+				XFS_QUOTA_STRING XFS_VNTRACE_STRING \
+				XFS_DBG_STRING /* DBG must be last */
+
+
+/*
+ * Fetch / store the vfs_t hung off a super_block.  The cast is wrapped
+ * in parentheses so that LINVFS_GET_VFS(sb)->field applies the cast
+ * before the member access.
+ */
+#define LINVFS_GET_VFS(s) \
+	((vfs_t *)((s)->u.generic_sbp))
+#define LINVFS_SET_VFS(s, vfsp) \
+	((s)->u.generic_sbp = vfsp)
+
+
+struct xfs_mount_args;
+
+extern void
+linvfs_set_inode_ops(
+	struct inode	*inode);
+
+extern int
+spectodevs(
+	struct super_block *sb,
+	struct xfs_mount_args *args,
+	dev_t		*ddevp,
+	dev_t		*logdevp,
+	dev_t		*rtdevp);
+
+#endif	/* __XFS_SUPER_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_sysctl.c linux-2.4-xfs/fs/xfs/linux/xfs_sysctl.c
--- linux-2.4.19/fs/xfs/linux/xfs_sysctl.c	Thu Jan  1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_sysctl.c	Fri Aug 30 11:08:18 2002
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include
+#include
+#include
+
+/*
+ * Tunable xfs parameters
+ */
+
+extern struct xfsstats xfsstats;
+
+unsigned long xfs_min[XFS_PARAM] = {			0,			0, 0 };
+unsigned long xfs_max[XFS_PARAM] = { XFS_REFCACHE_SIZE_MAX,  XFS_REFCACHE_SIZE_MAX, 1 };
+
+xfs_param_t xfs_params = { 128, 32, 0 };
+
+static struct ctl_table_header *xfs_table_header;
+
+/* proc handlers */
+
+extern void xfs_refcache_resize(int xfs_refcache_new_size);
+
+/*
+ * proc handler for the entry whose data is the refcache size.  After a
+ * successful write that changed the value, the cache is resized and
+ * refcache_purge is lowered so it never exceeds the new size.  Returns
+ * the result of proc_doulongvec_minmax().
+ *
+ * The data is accessed as ulong, the type proc_doulongvec_minmax()
+ * stores and the size the table entries declare; reading it through
+ * an int pointer picks the wrong half on 64-bit big-endian machines.
+ */
+static int
+xfs_refcache_resize_proc_handler(ctl_table *ctl, int write, struct file * filp,
+		       void *buffer, size_t *lenp)
+{
+	int	ret;
+	ulong	*valp = ctl->data;
+	int	xfs_refcache_new_size;
+	int	xfs_refcache_old_size = (int)*valp;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+	xfs_refcache_new_size = (int)*valp;
+
+	if (!ret && write && xfs_refcache_new_size != xfs_refcache_old_size) {
+		xfs_refcache_resize(xfs_refcache_new_size);
+		/* Don't purge more than size of the cache */
+		if (xfs_refcache_new_size < xfs_params.refcache_purge)
+			xfs_params.refcache_purge = xfs_refcache_new_size;
+	}
+
+	return ret;
+}
+
+/*
+ * proc handler for the stats-clear entry.  A successful write of a
+ * non-zero value zeroes xfsstats, keeping only vn_active, and resets
+ * stats_clear to 0.  Returns the result of proc_doulongvec_minmax().
+ */
+static int
+xfs_stats_clear_proc_handler(ctl_table *ctl, int write, struct file * filp,
+		       void *buffer, size_t *lenp)
+{
+	int		ret;
+	ulong		*valp = ctl->data;	/* ulong, as written by the proc helper */
+	__uint32_t	vn_active;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+
+	if (!ret && write && *valp) {
+		printk("XFS Clearing xfsstats\n");
+		/* save vn_active, it's a universal truth! */
+		vn_active = xfsstats.vn_active;
+		memset(&xfsstats, 0, sizeof(xfsstats));
+		xfsstats.vn_active = vn_active;
+		xfs_params.stats_clear = 0;
+	}
+
+	return ret;
+}
+
+/*
+ * NOTE(review): the entries below pair ulong data with the
+ * sysctl_intvec strategy - confirm this is what the binary sysctl
+ * interface should use for these values.
+ */
+static ctl_table xfs_table[] = {
+	{XFS_REFCACHE_SIZE, "refcache_size", &xfs_params.refcache_size,
+	sizeof(ulong), 0644, NULL, &xfs_refcache_resize_proc_handler,
+	&sysctl_intvec, NULL, &xfs_min[0], &xfs_max[0]},
+
+	{XFS_REFCACHE_PURGE, "refcache_purge", &xfs_params.refcache_purge,
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax,
+	&sysctl_intvec, NULL, &xfs_min[1], &xfs_params.refcache_size},
+
+	{XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear,
+	sizeof(ulong), 0644, NULL, &xfs_stats_clear_proc_handler,
+	&sysctl_intvec, NULL, &xfs_min[2], &xfs_max[2]},
+
+	{0}
+};
+
+static ctl_table xfs_dir_table[] = {
+	{FS_XFS, "xfs", NULL, 0, 0555, xfs_table},
+	{0}
+};
+
+static ctl_table xfs_root_table[] = {
+	{CTL_FS, "fs",  NULL, 0, 0555, xfs_dir_table},
+	{0}
+};
+
+/* Register the fs/xfs sysctl tree and remember its header. */
+void
+xfs_sysctl_register(void)
+{
+	xfs_table_header = register_sysctl_table(xfs_root_table, 1);
+}
+
+/* Unregister the sysctl tree if registration produced a header. */
+void
+xfs_sysctl_unregister(void)
+{
+	if (xfs_table_header)
+		unregister_sysctl_table(xfs_table_header);
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_sysctl.h linux-2.4-xfs/fs/xfs/linux/xfs_sysctl.h
--- linux-2.4.19/fs/xfs/linux/xfs_sysctl.h	Thu Jan  1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_sysctl.h	Fri Aug 30 11:08:18 2002
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef __XFS_SYSCTL_H__ +#define __XFS_SYSCTL_H__ + +#include + +/* + * Tunable xfs parameters + */ + +#define XFS_PARAM 3 + +typedef struct xfs_param { + ulong refcache_size; /* Size of nfs refcache */ + ulong refcache_purge; /* # of entries to purge each time */ + ulong stats_clear; /* reset all xfs stats to 0 */ +} xfs_param_t; + +enum { + XFS_REFCACHE_SIZE = 1, + XFS_REFCACHE_PURGE = 2, + XFS_STATS_CLEAR = 3, +}; + +extern xfs_param_t xfs_params; + +#ifdef CONFIG_SYSCTL +extern void xfs_sysctl_register(void); +extern void xfs_sysctl_unregister(void); +#else +static __inline void xfs_sysctl_register(void) { }; +static __inline void xfs_sysctl_unregister(void) { }; +#endif + +#endif /* __XFS_SYSCTL_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_version.h linux-2.4-xfs/fs/xfs/linux/xfs_version.h --- linux-2.4.19/fs/xfs/linux/xfs_version.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_version.h Wed Jul 31 20:47:05 2002 @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Dummy file that can contain a timestamp to put into the + * XFS init string, to help users keep track of what they're + * running + */ + +#ifndef __XFS_VERSION_H__ +#define __XFS_VERSION_H__ + +#define XFS_VERSION_STRING "CVS" + +#endif /* __XFS_VERSION_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_vfs.h linux-2.4-xfs/fs/xfs/linux/xfs_vfs.h --- linux-2.4.19/fs/xfs/linux/xfs_vfs.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_vfs.h Wed Sep 4 23:12:13 2002 @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_VFS_H__ +#define __XFS_VFS_H__ + +#include + +struct statfs; +struct vnode; +struct cred; +struct super_block; +struct fid; +struct dm_fcntl_vector; +struct xfs_mount_args; + +typedef struct vfs { + u_int vfs_flag; /* flags */ + fsid_t vfs_fsid; /* file system id */ + fsid_t *vfs_altfsid; /* An ID fixed for life of FS */ + bhv_head_t vfs_bh; /* head of vfs behavior chain */ + struct super_block *vfs_super; /* pointer to super block structure */ +} vfs_t; + +#define vfs_fbhv vfs_bh.bh_first /* 1st on vfs behavior chain */ +#define VFS_FOPS(vfsp) \ + ((vfsops_t *)((vfsp)->vfs_fbhv->bd_ops))/* ops for 1st behavior */ + + +#define bhvtovfs(bdp) ((struct vfs *)BHV_VOBJ(bdp)) +#define VFS_BHVHEAD(vfsp) (&(vfsp)->vfs_bh) + + +#define VFS_RDONLY 0x0001 /* read-only vfs */ +#define VFS_GRPID 0x0002 /* group-ID assigned from directory */ +#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */ + +#define SYNC_ATTR 0x0001 /* sync attributes */ +#define SYNC_CLOSE 0x0002 /* close file system down */ +#define SYNC_DELWRI 0x0004 /* look at delayed writes */ +#define SYNC_WAIT 0x0008 /* wait for i/o to complete */ +#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. 
superblocks) */ +#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */ + + +typedef struct vfsops { + int (*vfs_mount)(struct vfs *, struct xfs_mount_args *, + struct cred *); + /* mount file system */ + int (*vfs_dounmount)(bhv_desc_t *, int, struct vnode *, + struct cred *); + /* preparation and unmount */ + int (*vfs_unmount)(bhv_desc_t *, int, struct cred *); + /* unmount file system */ + int (*vfs_root)(bhv_desc_t *, struct vnode **); + /* get root vnode */ + int (*vfs_statvfs)(bhv_desc_t *, struct statfs *, struct vnode *); + /* get file system statistics */ + int (*vfs_sync)(bhv_desc_t *, int, struct cred *); + /* flush files */ + int (*vfs_vget)(bhv_desc_t *, struct vnode **, struct fid *); + /* get vnode from fid */ + int (*vfs_dmapi_mount)(struct vfs *, char *, char *); + /* send dmapi mount event */ + int (*vfs_dmapi_fsys_vector)(bhv_desc_t *, + struct dm_fcntl_vector *); + void (*vfs_force_shutdown)(bhv_desc_t *, + int, char *, int); +} vfsops_t; + +#define VFS_DOUNMOUNT(vfsp,f,vp,cr, rv) \ +{ \ + BHV_READ_LOCK(&(vfsp)->vfs_bh); \ + rv = (*(VFS_FOPS(vfsp)->vfs_dounmount))((vfsp)->vfs_fbhv, f, vp, cr); \ + BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \ +} +#define VFS_UNMOUNT(vfsp,f,cr, rv) \ +{ \ + BHV_READ_LOCK(&(vfsp)->vfs_bh); \ + rv = (*(VFS_FOPS(vfsp)->vfs_unmount))((vfsp)->vfs_fbhv, f, cr); \ + BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \ +} +#define VFS_ROOT(vfsp, vpp, rv) \ +{ \ + BHV_READ_LOCK(&(vfsp)->vfs_bh); \ + rv = (*(VFS_FOPS(vfsp)->vfs_root))((vfsp)->vfs_fbhv, vpp); \ + BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \ +} +#define VFS_STATVFS(vfsp, sp, vp, rv) \ +{ \ + BHV_READ_LOCK(&(vfsp)->vfs_bh); \ + rv = (*(VFS_FOPS(vfsp)->vfs_statvfs))((vfsp)->vfs_fbhv, sp, vp); \ + BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \ +} +#define VFS_SYNC(vfsp, flag, cr, rv) \ +{ \ + BHV_READ_LOCK(&(vfsp)->vfs_bh); \ + rv = (*(VFS_FOPS(vfsp)->vfs_sync))((vfsp)->vfs_fbhv, flag, cr); \ + BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \ +} +#define VFS_VGET(vfsp, vpp, fidp, rv) \ +{ \ + 
BHV_READ_LOCK(&(vfsp)->vfs_bh); \ + rv = (*(VFS_FOPS(vfsp)->vfs_vget))((vfsp)->vfs_fbhv, vpp, fidp); \ + BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \ +} +/* No behavior lock here */ +#define VFS_FORCE_SHUTDOWN(vfsp, flags) \ + (*(VFS_FOPS(vfsp)->vfs_force_shutdown))((vfsp)->vfs_fbhv, flags, __FILE__, __LINE__); + +#define VFS_DMAPI_FSYS_VECTOR(vfsp, df, rv) \ +{ \ + BHV_READ_LOCK(&(vfsp)->vfs_bh); \ + rv = (*(VFS_FOPS(vfsp)->vfs_dmapi_fsys_vector))((vfsp)->vfs_fbhv, df); \ + BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \ +} + + +#define VFSOPS_DMAPI_MOUNT(vfs_op, vfsp, dir_name, fsname, rv) \ + rv = (*(vfs_op)->vfs_dmapi_mount)(vfsp, dir_name, fsname) +#define VFSOPS_MOUNT(vfs_op, vfsp, args, cr, rv) \ + rv = (*(vfs_op)->vfs_mount)(vfsp, args, cr) + +#define VFS_REMOVEBHV(vfsp, bdp)\ +{ \ + bhv_remove(VFS_BHVHEAD(vfsp), bdp); \ +} + +#define PVFS_UNMOUNT(bdp,f,cr, rv) \ +{ \ + rv = (*((vfsops_t *)(bdp)->bd_ops)->vfs_unmount)(bdp, f, cr); \ +} + +#define PVFS_SYNC(bdp, flag, cr, rv) \ +{ \ + rv = (*((vfsops_t *)(bdp)->bd_ops)->vfs_sync)(bdp, flag, cr); \ +} + + +static __inline vfs_t * +vfs_allocate(void) +{ + vfs_t *vfsp; + + vfsp = kmalloc(sizeof(vfs_t), GFP_KERNEL); + if (vfsp) { + memset(vfsp, 0, sizeof(vfs_t)); + bhv_head_init(VFS_BHVHEAD(vfsp), "vfs"); + } + return (vfsp); +} + +static __inline void +vfs_deallocate( + vfs_t *vfsp) +{ + bhv_head_destroy(VFS_BHVHEAD(vfsp)); + kfree(vfsp); +} + +/* + * Called by fs dependent VFS_MOUNT code to link the VFS base file system + * dependent behavior with the VFS virtual object. + */ +static __inline void +vfs_insertbhv( + vfs_t *vfsp, + bhv_desc_t *bdp, + vfsops_t *vfsops, + void *mount) +{ + /* + * Initialize behavior desc with ops and data and then + * attach it to the vfs. 
+ */ + bhv_desc_init(bdp, mount, vfsp, vfsops); + bhv_insert_initial(&vfsp->vfs_bh, bdp); +} + +#endif /* __XFS_VFS_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_vnode.c linux-2.4-xfs/fs/xfs/linux/xfs_vnode.c --- linux-2.4.19/fs/xfs/linux/xfs_vnode.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/linux/xfs_vnode.c Thu Sep 5 15:35:08 2002 @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include + + +uint64_t vn_generation; /* vnode generation number */ + +spinlock_t vnumber_lock = SPIN_LOCK_UNLOCKED; + +/* + * Dedicated vnode inactive/reclaim sync semaphores. + * Prime number of hash buckets since address is used as the key. 
+ */
+#define NVSYNC			37
+/* sync variable for a vnode address; the argument is parenthesised so any expression may be passed */
+#define vptosync(v)		(&vsync[((unsigned long)(v)) % NVSYNC])
+sv_t vsync[NVSYNC];
+
+/*
+ * Translate stat(2) file types to vnode types and vice versa.
+ * Aware of numeric order of S_IFMT and vnode type values.
+ */
+enum vtype iftovt_tab[] = {
+	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
+};
+
+u_short vttoif_tab[] = {
+	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
+};
+
+#define VN_LOCK(vp)		spin_lock(&(vp)->v_lock)
+#define VN_UNLOCK(vp)		spin_unlock(&(vp)->v_lock)
+
+
+/* Initialise every entry of the vsync[] array. */
+void
+vn_init(void)
+{
+	register sv_t *svp;
+	register int i;
+
+	for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
+		init_sv(svp, SV_DEFAULT, "vsy", i);
+}
+
+
+/*
+ * Clean a vnode of filesystem-specific data and prepare it for reuse.
+ * Returns 0, or the negated VOP_RECLAIM error, in which case the vnode
+ * is left as it was.
+ */
+STATIC int
+vn_reclaim(struct vnode *vp)
+{
+	int error;
+
+	XFS_STATS_INC(xfsstats.vn_reclaim);
+
+	vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
+
+	/*
+	 * Only make the VOP_RECLAIM call if there are behaviors
+	 * to call.
+	 */
+	if (vp->v_fbhv != NULL) {
+		VOP_RECLAIM(vp, error);
+		if (error)
+			return -error;
+	}
+	ASSERT(vp->v_fbhv == NULL);
+
+	VN_LOCK(vp);
+
+	/* drop every flag except VRECLM and VWAIT */
+	vp->v_flag &= (VRECLM|VWAIT);
+	VN_UNLOCK(vp);
+
+	vp->v_type = VNON;
+	vp->v_fbhv = NULL;
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	ktrace_free(vp->v_trace);
+	vp->v_trace = NULL;
+#endif
+
+	return 0;
+}
+
+/*
+ * Broadcast on the vnode's sync variable if VWAIT is set, then clear
+ * VRECLM, VWAIT and VMODIFIED, all under the vnode lock.
+ */
+STATIC void
+vn_wakeup(struct vnode *vp)
+{
+	VN_LOCK(vp);
+	if (vp->v_flag & VWAIT) {
+		sv_broadcast(vptosync(vp));
+	}
+	vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED);
+	VN_UNLOCK(vp);
+}
+
+/*
+ * If the vnode has VINACT or VRECLM set, mark it VWAIT, sleep on its
+ * sync variable and return 1; otherwise return 0.
+ * NOTE(review): the sleeping path does not call VN_UNLOCK itself, so
+ * sv_wait() is presumably responsible for dropping v_lock - confirm.
+ */
+int
+vn_wait(struct vnode *vp)
+{
+	VN_LOCK(vp);
+	if (vp->v_flag & (VINACT | VRECLM)) {
+		vp->v_flag |= VWAIT;
+		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
+		return 1;
+	}
+	VN_UNLOCK(vp);
+	return 0;
+}
+
+/*
+ * Initialise the vnode belonging to 'inode': flags, lock, a non-zero
+ * v_number taken from vn_generation, the behavior chain head and (when
+ * configured) the trace buffer.  Returns the vnode.
+ */
+struct vnode *
+vn_initialize(struct inode *inode)
+{
+	struct vnode	*vp = LINVFS_GET_VP(inode);
+
+	XFS_STATS_INC(xfsstats.vn_active);
+
+	vp->v_flag = VMODIFIED;
+	spinlock_init(&vp->v_lock, "v_lock");
+
+	spin_lock(&vnumber_lock);
+	if (!++vn_generation)	/* v_number shouldn't be zero */
+		vn_generation++;
+	vp->v_number = vn_generation;
+	spin_unlock(&vnumber_lock);
+
+	ASSERT(VN_CACHED(vp) == 0);
+
+	/* Initialize the first behavior and the behavior chain head. */
+	vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+
+	vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
+	return vp;
+}
+
+/*
+ * Get a reference on a vnode.
+ * Returns vp when iget_locked() finds an existing inode for the
+ * superblock and inode number in 'vmap'; returns NULL when the inode
+ * is being freed, is not present, or would have to be newly created.
+ */
+vnode_t *
+vn_get(struct vnode *vp, vmap_t *vmap)
+{
+	struct inode	*inode;
+
+	XFS_STATS_INC(xfsstats.vn_get);
+	inode = LINVFS_GET_IP(vp);
+	if (inode->i_state & I_FREEING)
+		return NULL;
+
+	inode = iget_locked(vmap->v_vfsp->vfs_super, vmap->v_ino);
+	if (inode == NULL)		/* Inode not present */
+		return NULL;
+
+	/* We do not want to create new inodes via vn_get,
+	 * returning NULL here is OK.
+	 */
+	if (inode->i_state & I_NEW) {
+		make_bad_inode(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		return NULL;
+	}
+
+	vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
+	ASSERT((vp->v_flag & VPURGE) == 0);
+
+	return vp;
+}
+
+/*
+ * "revalidate" the linux inode.
+ * Fetches the attributes with VOP_GETATTR and copies them into the
+ * inode.  Size, block count and timestamps are copied only for
+ * regular files, directories and symlinks, or when ATTR_COMM is set.
+ * Returns 0 or the negated VOP_GETATTR error.
+ */
+int
+vn_revalidate(struct vnode *vp, int flags)
+{
+	int		error;
+	struct inode	*inode;
+	vattr_t		va;
+
+	vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address);
+
+	va.va_mask = AT_STAT|AT_GENCOUNT;
+
+	ASSERT(vp->v_bh.bh_first != NULL);
+
+	VOP_GETATTR(vp, &va, flags & ATTR_LAZY, NULL, error);
+
+	if (! error) {
+		inode = LINVFS_GET_IP(vp);
+		ASSERT(inode);
+
+		inode->i_mode	    = VTTOIF(va.va_type) | va.va_mode;
+		inode->i_nlink	    = va.va_nlink;
+		inode->i_uid	    = va.va_uid;
+		inode->i_gid	    = va.va_gid;
+		inode->i_rdev	    = mk_kdev(MAJOR(va.va_rdev),
+					      MINOR(va.va_rdev));
+		inode->i_blksize    = PAGE_CACHE_SIZE;
+		inode->i_generation = va.va_gencount;
+		if ((flags & ATTR_COMM) ||
+		    S_ISREG(inode->i_mode) ||
+		    S_ISDIR(inode->i_mode) ||
+		    S_ISLNK(inode->i_mode)) {
+			inode->i_size	    = va.va_size;
+			inode->i_blocks	    = va.va_nblocks;
+			inode->i_atime	    = va.va_atime.tv_sec;
+			inode->i_mtime	    = va.va_mtime.tv_sec;
+			inode->i_ctime	    = va.va_ctime.tv_sec;
+		}
+		if (flags & ATTR_LAZY)
+			vp->v_flag &= ~VMODIFIED;
+		else
+			VUNMODIFY(vp);
+	} else {
+		vn_trace_exit(vp, "vn_revalidate.error",
+					(inst_t *)__return_address);
+	}
+
+	return -error;
+}
+
+
+/*
+ * purge a vnode from the cache
+ * At this point the vnode is guaranteed to have no references (vn_count == 0)
+ * The caller has to make sure that there are no ways someone could
+ * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
+ */
+/*
+ * vn_purge() reclaims vp only if it still matches the caller's vmap
+ * snapshot.  It returns without reclaiming when vp->v_number differs
+ * from vmap->v_number or when vn_count(vp) is greater than zero.
+ * Otherwise it sets VRECLM under VN_LOCK, runs vn_reclaim() (a failure
+ * panics) and finally calls vn_wakeup().
+ */
+void
+vn_purge(struct vnode *vp, vmap_t *vmap)
+{
+	vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
+
+	ASSERT(vp->v_flag & VPURGE);
+
+again:
+	/*
+	 * Check whether vp has already been reclaimed since our caller
+	 * sampled its version while holding a filesystem cache lock that
+	 * its VOP_RECLAIM function acquires.
+	 */
+	VN_LOCK(vp);
+	if (vp->v_number != vmap->v_number) {
+		VN_UNLOCK(vp);
+		return;
+	}
+
+	/*
+	 * If vp is being reclaimed or inactivated, wait until it is inert,
+	 * then proceed.  Can't assume that vnode is actually reclaimed
+	 * just because the reclaimed flag is asserted -- a vn_alloc
+	 * reclaim can fail.
+	 *
+	 * NOTE(review): sv_wait() is entered with v_lock held and the
+	 * retry re-takes VN_LOCK at "again", so sv_wait() presumably
+	 * returns with the lock released -- confirm.
+	 */
+	if (vp->v_flag & (VINACT | VRECLM)) {
+		ASSERT(vn_count(vp) == 0);
+		vp->v_flag |= VWAIT;
+		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
+		goto again;
+	}
+
+	/*
+	 * Another process could have raced in and gotten this vnode...
+	 */
+	if (vn_count(vp) > 0) {
+		VN_UNLOCK(vp);
+		return;
+	}
+
+	XFS_STATS_DEC(xfsstats.vn_active);
+	vp->v_flag |= VRECLM;
+	VN_UNLOCK(vp);
+
+	/*
+	 * Call VOP_RECLAIM and clean vp.  The FSYNC_INVAL flag tells
+	 * vp's filesystem to flush and invalidate all cached resources.
+	 * When vn_reclaim returns, vp should have no private data,
+	 * either in a system cache or attached to v_data.
+	 *
+	 * NOTE(review): no FSYNC_INVAL (or any other flag) is passed on
+	 * this path; vn_reclaim() takes only the vnode.  The sentence
+	 * above looks stale.
+	 */
+	if (vn_reclaim(vp) != 0)
+		panic("vn_purge: cannot reclaim");
+
+	/*
+	 * Wakeup anyone waiting for vp to be reclaimed.
+	 */
+	vn_wakeup(vp);
+}
+
+/*
+ * Add a reference to a referenced vnode.
+ *
+ * The reference is taken on the embedded linux inode with igrab() while
+ * VN_LOCK is held.  igrab()'s result is checked only by ASSERT; the
+ * function always returns vp.
+ */
+struct vnode *
+vn_hold(struct vnode *vp)
+{
+	struct inode *inode;
+
+	XFS_STATS_INC(xfsstats.vn_hold);
+
+	VN_LOCK(vp);
+	inode = igrab(LINVFS_GET_IP(vp));
+	ASSERT(inode);
+	VN_UNLOCK(vp);
+
+	return vp;
+}
+
+/*
+ * Call VOP_INACTIVE on last reference.
+ */ +void +vn_rele(struct vnode *vp) +{ + int vcnt; + /* REFERENCED */ + int cache; + + XFS_STATS_INC(xfsstats.vn_rele); + + + VN_LOCK(vp); + + vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address); + vcnt = vn_count(vp); + + /* + * Since we always get called from put_inode we know + * that i_count won't be decremented after we + * return. + */ + if (vcnt == 0) { + /* + * As soon as we turn this on, noone can find us in vn_get + * until we turn off VINACT or VRECLM + */ + vp->v_flag |= VINACT; + VN_UNLOCK(vp); + + /* + * Do not make the VOP_INACTIVE call if there + * are no behaviors attached to the vnode to call. + */ + if (vp->v_fbhv != NULL) { + VOP_INACTIVE(vp, NULL, cache); + } + + VN_LOCK(vp); + if (vp->v_flag & VWAIT) { + if (vp->v_flag & VWAIT) { + sv_broadcast(vptosync(vp)); + } + } + + vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED); + + } + + VN_UNLOCK(vp); + + vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address); +} + + +/* + * Finish the removal of a vnode. + */ +void +vn_remove(struct vnode *vp) +{ + /* REFERENCED */ + vmap_t vmap; + + /* Make sure we don't do this to the same vnode twice */ + if (!(vp->v_fbhv)) + return; + + XFS_STATS_INC(xfsstats.vn_remove); + + vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address); + + /* + * After the following purge the vnode + * will no longer exist. + */ + VMAP(vp, XFS_BHVTOI(vp->v_fbhv), vmap); + + vn_purge(vp, &vmap); +} + + +#ifdef CONFIG_XFS_VNODE_TRACING + +#define KTRACE_ENTER(vp, vk, s, line, ra) \ + ktrace_enter( (vp)->v_trace, \ +/* 0 */ (void *)(__psint_t)(vk), \ +/* 1 */ (void *)(s), \ +/* 2 */ (void *)(__psint_t) line, \ +/* 3 */ (void *)(vn_count(vp)), \ +/* 4 */ (void *)(ra), \ +/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \ +/* 6 */ (void *)(__psint_t)smp_processor_id(), \ +/* 7 */ (void *)(__psint_t)(current->pid), \ +/* 8 */ (void *)__return_address, \ +/* 9 */ 0, 0, 0, 0, 0, 0, 0) + +/* + * Vnode tracing code. 
+ */
+/* Record a VNODE_KTRACE_ENTRY event for func; the line slot is 0. */
+void
+vn_trace_entry(vnode_t *vp, char *func, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
+}
+
+/* Record a VNODE_KTRACE_EXIT event for func; the line slot is 0. */
+void
+vn_trace_exit(vnode_t *vp, char *func, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
+}
+
+/* Record a VNODE_KTRACE_HOLD event tagged with the caller's file/line. */
+void
+vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
+}
+
+/* Record a VNODE_KTRACE_REF event tagged with the caller's file/line. */
+void
+vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
+}
+
+/* Record a VNODE_KTRACE_RELE event tagged with the caller's file/line. */
+void
+vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
+}
+#endif	/* CONFIG_XFS_VNODE_TRACING */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/linux/xfs_vnode.h linux-2.4-xfs/fs/xfs/linux/xfs_vnode.h
--- linux-2.4.19/fs/xfs/linux/xfs_vnode.h	Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/linux/xfs_vnode.h	Wed Aug 28 23:25:32 2002
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_VNODE_H__
+#define __XFS_VNODE_H__
+
+/*
+ * Vnode types (unrelated to on-disk inodes).  VNON means no type.
+ * The numeric values are used directly as array indices by VTTOIF(),
+ * so they must stay in step with vttoif_tab[].
+ */
+typedef enum vtype {
+	VNON	= 0,
+	VREG	= 1,
+	VDIR	= 2,
+	VBLK	= 3,
+	VCHR	= 4,
+	VLNK	= 5,
+	VFIFO	= 6,
+	VBAD	= 7,
+	VSOCK	= 8
+} vtype_t;
+
+/* In-core vnode number (64-bit). */
+typedef __u64	vnumber_t;
+
+/*
+ * Define the type of behavior head used by vnodes.
+ */
+#define vn_bhv_head_t	bhv_head_t
+
+/*
+ * MP locking protocols:
+ *	v_flag, v_count				VN_LOCK/VN_UNLOCK
+ *	v_vfsp					VN_LOCK/VN_UNLOCK
+ *	v_type					read-only or fs-dependent
+ *	v_list, v_hashp, v_hashn		freelist lock
+ *
+ * NOTE(review): v_count, v_list, v_hashp and v_hashn are not members of
+ * the structure declared below; those rows appear to be left over from
+ * another version of this structure.
+ */
+typedef struct vnode {
+	__u32		v_flag;			/* vnode flags (see below) */
+	enum vtype	v_type;			/* vnode type */
+	struct vfs	*v_vfsp;		/* ptr to containing VFS */
+	vnumber_t	v_number;		/* in-core vnode number */
+	vn_bhv_head_t	v_bh;			/* behavior head */
+
+	spinlock_t	v_lock;			/* don't use VLOCK on Linux */
+	struct inode	v_inode;		/* linux inode */
+#ifdef CONFIG_XFS_VNODE_TRACING
+	struct ktrace	*v_trace;		/* trace header structure */
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+} vnode_t;
+
+/*
+ * Vnode to Linux inode mapping.
+ *
+ * The linux inode is embedded in the vnode as v_inode: LINVFS_GET_IP()
+ * takes the address of that member, and LINVFS_GET_VP() uses
+ * list_entry() to step back from an inode pointer to the enclosing
+ * vnode.
+ */
+#define LINVFS_GET_VP(inode)	((vnode_t *)list_entry(inode, vnode_t, v_inode))
+#define LINVFS_GET_IP(vp)	(&(vp)->v_inode)
+
+/*
+ * Conversion between vnode types/modes and encoded type/mode as
+ * seen by stat(2) and mknod(2).
+ */
+extern enum vtype	iftovt_tab[];
+extern ushort		vttoif_tab[];
+/* IFTOVT: index iftovt_tab[] with the S_IFMT bits of M shifted right by 12. */
+#define IFTOVT(M)	(iftovt_tab[((M) & S_IFMT) >> 12])
+/* VTTOIF: index vttoif_tab[] with the vnode type T; T is not range-checked. */
+#define VTTOIF(T)	(vttoif_tab[(int)(T)])
+/* MAKEIMODE: file-type bits for T combined with M stripped of S_IFMT bits. */
+#define MAKEIMODE(T, M)	(VTTOIF(T) | ((M) & ~S_IFMT))
+
+/*
+ * Vnode flags.
+ *
+ * The vnode flags fall into two categories:
+ * 1) Local only -
+ *    Flags that are relevant only to a particular cell
+ * 2) Single system image -
+ *    Flags that must be maintained coherent across all cells
+ */
+ /* Local only flags */
+#define VINACT		       0x1	/* vnode is being inactivated	*/
+#define VRECLM		       0x2	/* vnode is being reclaimed	*/
+#define VWAIT		       0x4	/* waiting for VINACT
+					   or VRECLM to finish */
+#define VMODIFIED	       0x8	/* xfs inode state possibly different
+					 * from linux inode state.
+					 */
+
+/* Single system image flags */
+#define VROOT		  0x100000	/* root of its file system	*/
+#define VNOSWAP		  0x200000	/* cannot be used as virt swap device */
+#define VISSWAP		  0x400000	/* vnode is part of virt swap device */
+#define VREPLICABLE	  0x800000	/* Vnode can have replicated pages */
+#define VNONREPLICABLE	 0x1000000	/* Vnode has writers. Don't replicate */
+#define VDOCMP		 0x2000000	/* Vnode has special VOP_CMP impl. */
+#define VSHARE		 0x4000000	/* vnode part of global cache	*/
+					/* VSHARE applies to local cell only */
+#define VFRLOCKS	 0x8000000	/* vnode has FR locks applied	*/
+#define VENF_LOCKING	0x10000000	/* enf. mode FR locking in effect */
+#define VOPLOCK		0x20000000	/* oplock set on the vnode	*/
+#define VPURGE		0x40000000	/* In the linux 'put' thread	*/
+
+/* Lock modes passed to VOP_RWLOCK / VOP_RWLOCK_TRY / VOP_RWUNLOCK. */
+typedef enum vrwlock {	VRWLOCK_NONE, VRWLOCK_READ,
+			VRWLOCK_WRITE, VRWLOCK_WRITE_DIRECT,
+			VRWLOCK_TRY_READ, VRWLOCK_TRY_WRITE } vrwlock_t;
+
+/*
+ * Return values for VOP_INACTIVE.  A return value of
+ * VN_INACTIVE_NOCACHE implies that the file system behavior
+ * has disassociated its state and bhv_desc_t from the vnode.
+ */
+#define VN_INACTIVE_CACHE	0
+#define VN_INACTIVE_NOCACHE	1
+
+/*
+ * Values for the cmd code given to VOP_VNODE_CHANGE.
+ */ +typedef enum vchange { + VCHANGE_FLAGS_FRLOCKS = 0, + VCHANGE_FLAGS_ENF_LOCKING = 1, + VCHANGE_FLAGS_TRUNCATED = 2, + VCHANGE_FLAGS_PAGE_DIRTY = 3, + VCHANGE_FLAGS_IOEXCL_COUNT = 4 +} vchange_t; + +/* + * Macros for dealing with the behavior descriptor inside of the vnode. + */ +#define BHV_TO_VNODE(bdp) ((vnode_t *)BHV_VOBJ(bdp)) +#define BHV_TO_VNODE_NULL(bdp) ((vnode_t *)BHV_VOBJNULL(bdp)) + +#define VNODE_TO_FIRST_BHV(vp) (BHV_HEAD_FIRST(&(vp)->v_bh)) +#define VN_BHV_HEAD(vp) ((vn_bhv_head_t *)(&((vp)->v_bh))) +#define VN_BHV_READ_LOCK(bhp) BHV_READ_LOCK(bhp) +#define VN_BHV_READ_UNLOCK(bhp) BHV_READ_UNLOCK(bhp) +#define VN_BHV_WRITE_LOCK(bhp) BHV_WRITE_LOCK(bhp) +#define VN_BHV_NOT_READ_LOCKED(bhp) BHV_NOT_READ_LOCKED(bhp) +#define VN_BHV_NOT_WRITE_LOCKED(bhp) BHV_NOT_WRITE_LOCKED(bhp) +#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name) +#define vn_bhv_head_reinit(bhp) bhv_head_reinit(bhp) +#define vn_bhv_insert_initial(bhp,bdp) bhv_insert_initial(bhp,bdp) +#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp) +#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops) +#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops) + +#define v_fbhv v_bh.bh_first /* first behavior */ +#define v_fops v_bh.bh_first->bd_ops /* ops for first behavior */ + + +union rval; +struct uio; +struct file; +struct vattr; +struct page_buf_bmap_s; +struct attrlist_cursor_kern; + +typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); +typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct file *, char *, size_t, + loff_t *, struct cred *); +typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct file *, const char *, size_t, + loff_t *, struct cred *); +typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, unsigned int, unsigned long); +typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int, + struct cred *); +typedef int (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int, + struct cred *); +typedef int (*vop_access_t)(bhv_desc_t *, int, struct 
cred *); +typedef int (*vop_lookup_t)(bhv_desc_t *, struct dentry *, vnode_t **, + int, vnode_t *, struct cred *); +typedef int (*vop_create_t)(bhv_desc_t *, struct dentry *, struct vattr *, + vnode_t **, struct cred *); +typedef int (*vop_remove_t)(bhv_desc_t *, struct dentry *, struct cred *); +typedef int (*vop_link_t)(bhv_desc_t *, vnode_t *, struct dentry *, + struct cred *); +typedef int (*vop_rename_t)(bhv_desc_t *, struct dentry *, vnode_t *, + struct dentry *, struct cred *); +typedef int (*vop_mkdir_t)(bhv_desc_t *, struct dentry *, struct vattr *, + vnode_t **, struct cred *); +typedef int (*vop_rmdir_t)(bhv_desc_t *, struct dentry *, struct cred *); +typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *, + int *); +typedef int (*vop_symlink_t)(bhv_desc_t *, struct dentry *, + struct vattr *, char *, + vnode_t **, struct cred *); +typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, struct cred *); +typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *, xfs_off_t, xfs_off_t); +typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *); +typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *); +typedef int (*vop_release_t)(bhv_desc_t *); +typedef int (*vop_rwlock_t)(bhv_desc_t *, vrwlock_t); +typedef void (*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t); +typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, struct page_buf_bmap_s *, int *); +typedef int (*vop_strategy_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, struct page_buf_bmap_s *, int *); +typedef int (*vop_reclaim_t)(bhv_desc_t *); +typedef int (*vop_attr_get_t)(bhv_desc_t *, char *, char *, int *, int, + struct cred *); +typedef int (*vop_attr_set_t)(bhv_desc_t *, char *, char *, int, int, + struct cred *); +typedef int (*vop_attr_remove_t)(bhv_desc_t *, char *, int, struct cred *); +typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int, + struct attrlist_cursor_kern *, struct cred *); +typedef void 
(*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int); +typedef void (*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t); +typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int); +typedef int (*vop_iflush_t)(bhv_desc_t *, int); + + +typedef struct vnodeops { + bhv_position_t vn_position; /* position within behavior chain */ + vop_open_t vop_open; + vop_read_t vop_read; + vop_write_t vop_write; + vop_ioctl_t vop_ioctl; + vop_getattr_t vop_getattr; + vop_setattr_t vop_setattr; + vop_access_t vop_access; + vop_lookup_t vop_lookup; + vop_create_t vop_create; + vop_remove_t vop_remove; + vop_link_t vop_link; + vop_rename_t vop_rename; + vop_mkdir_t vop_mkdir; + vop_rmdir_t vop_rmdir; + vop_readdir_t vop_readdir; + vop_symlink_t vop_symlink; + vop_readlink_t vop_readlink; + vop_fsync_t vop_fsync; + vop_inactive_t vop_inactive; + vop_fid2_t vop_fid2; + vop_rwlock_t vop_rwlock; + vop_rwunlock_t vop_rwunlock; + vop_bmap_t vop_bmap; + vop_strategy_t vop_strategy; + vop_reclaim_t vop_reclaim; + vop_attr_get_t vop_attr_get; + vop_attr_set_t vop_attr_set; + vop_attr_remove_t vop_attr_remove; + vop_attr_list_t vop_attr_list; + vop_link_removed_t vop_link_removed; + vop_vnode_change_t vop_vnode_change; + vop_ptossvp_t vop_tosspages; + vop_pflushinvalvp_t vop_flushinval_pages; + vop_pflushvp_t vop_flush_pages; + vop_release_t vop_release; + vop_iflush_t vop_iflush; +} vnodeops_t; + +/* + * VOP's. + */ +#define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op) + +/* + * Be careful with VOP_OPEN, since we're holding the chain lock on the + * original vnode and VOP_OPEN semantic allows the new vnode to be returned + * in vpp. The practice of passing &vp for vpp just doesn't work. 
+ */ +#define VOP_READ(vp,file,buf,size,offset,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,buf,size,offset,cr); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_WRITE(vp,file,buf,size,offset,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,buf,size,offset,cr);\ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_BMAP(vp,of,sz,rw,cr,b,n,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,cr,b,n); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_STRATEGY(vp,of,sz,rw,cr,b,n,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_strategy, vp)((vp)->v_fbhv,of,sz,rw,cr,b,n); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_OPEN(vp, cr, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_GETATTR(vp, vap, f, cr, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_SETATTR(vp, vap, f, cr, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_ACCESS(vp, mode, cr, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_CREATE(dvp,d,vap,vpp,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(dvp)->v_bh); \ + rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr); \ + VN_BHV_READ_UNLOCK(&(dvp)->v_bh); \ +} +#define VOP_REMOVE(dvp,d,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(dvp)->v_bh); \ + rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr); \ + 
VN_BHV_READ_UNLOCK(&(dvp)->v_bh); \ +} +#define VOP_LINK(tdvp,fvp,d,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(tdvp)->v_bh); \ + rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr); \ + VN_BHV_READ_UNLOCK(&(tdvp)->v_bh); \ +} +#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(fvp)->v_bh); \ + rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr); \ + VN_BHV_READ_UNLOCK(&(fvp)->v_bh); \ +} +#define VOP_MKDIR(dp,d,vap,vpp,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(dp)->v_bh); \ + rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr); \ + VN_BHV_READ_UNLOCK(&(dp)->v_bh); \ +} +#define VOP_RMDIR(dp,d,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(dp)->v_bh); \ + rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr); \ + VN_BHV_READ_UNLOCK(&(dp)->v_bh); \ +} +#define VOP_READDIR(vp,uiop,cr,eofp,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(dvp)->v_bh); \ + rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr); \ + VN_BHV_READ_UNLOCK(&(dvp)->v_bh); \ +} +#define VOP_READLINK(vp,uiop,cr,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,cr); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_FSYNC(vp,f,cr,b,e,rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_INACTIVE(vp, cr, rv) \ +{ /* vnode not reference-able, so no need to lock chain */ \ + rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr); \ +} +#define VOP_RELEASE(vp, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_release, vp)((vp)->v_fbhv); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_FID2(vp, fidp, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_RWLOCK(vp,i) \ +{ \ + 
VN_BHV_READ_LOCK(&(vp)->v_bh); \ + (void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i); \ + /* "allow" is done by rwunlock */ \ +} +#define VOP_RWLOCK_TRY(vp,i) \ + _VOP_(vop_rwlock, vp)((vp)->v_fbhv, i) + +#define VOP_RWUNLOCK(vp,i) \ +{ /* "prevent" was done by rwlock */ \ + (void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_RECLAIM(vp, rv) \ +{ /* vnode not reference-able, so no need to lock chain */ \ + rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv); \ +} +#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred);\ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_LINK_REMOVED(vp, dvp, linkzero) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + (void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_VNODE_CHANGE(vp, cmd, val) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + (void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +/* + * These are page cache functions that now go thru VOPs. 
+ * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_TOSS_PAGES(vp, first, last, fiopt) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + _VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +/* + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + _VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +/* + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt);\ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_IOCTL(vp, inode, filp, cmd, arg, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,cmd,arg); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} +#define VOP_IFLUSH(vp, flags, rv) \ +{ \ + VN_BHV_READ_LOCK(&(vp)->v_bh); \ + rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags); \ + VN_BHV_READ_UNLOCK(&(vp)->v_bh); \ +} + +/* + * Flags for VOP_IFLUSH call + */ + +#define FLUSH_SYNC 1 /* wait for flush to complete */ +#define FLUSH_INODE 2 /* flush the inode itself */ +#define FLUSH_LOG 4 /* force the last log entry for + * this inode out to disk */ + +/* + * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and + * VOP_FLUSH_PAGES. + */ +#define FI_NONE 0 /* none */ +#define FI_REMAPF 1 /* Do a remapf prior to the operation */ +#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. + Prevent VM access to the pages until + the operation completes. */ + +/* + * Vnode attributes. va_mask indicates those attributes the caller + * wants to set (setattr) or extract (getattr). 
+ */
+typedef struct vattr {
+	int		va_mask;	/* bit-mask of attributes */
+	vtype_t		va_type;	/* vnode type (for create) */
+	mode_t		va_mode;	/* file access mode */
+	uid_t		va_uid;		/* owner user id */
+	gid_t		va_gid;		/* owner group id */
+	dev_t		va_fsid;	/* file system id (dev for now) */
+	xfs_ino_t	va_nodeid;	/* node id */
+	nlink_t		va_nlink;	/* number of references to file */
+	xfs_off_t	va_size;	/* file size in bytes */
+	timespec_t	va_atime;	/* time of last access */
+	timespec_t	va_mtime;	/* time of last modification */
+	timespec_t	va_ctime;	/* time file ``created'' */
+	dev_t		va_rdev;	/* device the file represents */
+	u_long		va_blksize;	/* fundamental block size */
+	__int64_t	va_nblocks;	/* # of blocks allocated */
+	u_long		va_vcode;	/* version code */
+	u_long		va_xflags;	/* random extended file flags */
+	u_long		va_extsize;	/* file extent size */
+	u_long		va_nextents;	/* number of extents in file */
+	u_long		va_anextents;	/* number of attr extents in file */
+	int		va_projid;	/* project id */
+	u_int		va_gencount;	/* object generation count */
+} vattr_t;
+
+/*
+ * setattr or getattr attributes
+ * (one bit per attribute; combined into vattr.va_mask)
+ */
+#define AT_TYPE		0x00000001
+#define AT_MODE		0x00000002
+#define AT_UID		0x00000004
+#define AT_GID		0x00000008
+#define AT_FSID		0x00000010
+#define AT_NODEID	0x00000020
+#define AT_NLINK	0x00000040
+#define AT_SIZE		0x00000080
+#define AT_ATIME	0x00000100
+#define AT_MTIME	0x00000200
+#define AT_CTIME	0x00000400
+#define AT_RDEV		0x00000800
+#define AT_BLKSIZE	0x00001000
+#define AT_NBLOCKS	0x00002000
+#define AT_VCODE	0x00004000
+#define AT_MAC		0x00008000
+#define AT_UPDATIME	0x00010000
+#define AT_UPDMTIME	0x00020000
+#define AT_UPDCTIME	0x00040000
+#define AT_ACL		0x00080000
+#define AT_CAP		0x00100000
+#define AT_INF		0x00200000
+#define AT_XFLAGS	0x00400000
+#define AT_EXTSIZE	0x00800000
+#define AT_NEXTENTS	0x01000000
+#define AT_ANEXTENTS	0x02000000
+#define AT_PROJID	0x04000000
+#define AT_SIZE_NOPERM	0x08000000
+#define AT_GENCOUNT	0x10000000
+
+/*
+ * AT_ALL leaves out the AT_UPD*TIME bits and AT_SIZE_NOPERM.
+ */
+#define AT_ALL	(AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
+		AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|\
+		AT_BLKSIZE|AT_NBLOCKS|AT_VCODE|AT_MAC|AT_ACL|AT_CAP|\
+		AT_INF|AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS|AT_ANEXTENTS|\
+		AT_PROJID|AT_GENCOUNT)
+
+/* Attribute set requested by vn_revalidate() (together with AT_GENCOUNT). */
+#define AT_STAT	(AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
+		AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_BLKSIZE|\
+		AT_NBLOCKS|AT_PROJID)
+
+#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME)
+
+#define AT_UPDTIMES (AT_UPDATIME|AT_UPDMTIME|AT_UPDCTIME)
+
+#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
+		 AT_BLKSIZE|AT_NBLOCKS|AT_VCODE|AT_NEXTENTS|AT_ANEXTENTS|\
+		 AT_GENCOUNT)
+
+#define VREAD		00400
+#define VWRITE		00200
+#define VEXEC		00100
+#define VSGID		02000		/* set group id on execution */
+#define MODEMASK	07777		/* mode bits plus permission bits */
+
+/*
+ * Check whether mandatory file locking is enabled.
+ * True for a VREG vnode whose mode has the set-group-id bit set and the
+ * group-execute bit (VEXEC >> 3) clear.
+ */
+#define MANDLOCK(vp, mode)	\
+	((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
+
+extern void	vn_init(void);
+extern int	vn_wait(struct vnode *);
+extern vnode_t	*vn_initialize(struct inode *);
+
+/*
+ * Acquiring and invalidating vnodes:
+ *
+ *	if (vn_get(vp, &vmap))
+ *		...;
+ *	vn_purge(vp, &vmap);
+ *
+ * vn_get and vn_purge must be called with vmap_t arguments, sampled
+ * while a lock that the vnode's VOP_RECLAIM function acquires is
+ * held, to ensure that the vnode sampled with the lock held isn't
+ * recycled (VOP_RECLAIMed) or deallocated between the release of the lock
+ * and the subsequent vn_get or vn_purge.
+ */
+
+/*
+ * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
+ */
+typedef struct vnode_map {
+	vfs_t		*v_vfsp;
+	vnumber_t	v_number;		/* in-core vnode number */
+	xfs_ino_t	v_ino;			/* inode # */
+} vmap_t;
+
+/*
+ * Snapshot vp's v_vfsp and v_number plus ip's i_ino into vmap.
+ * vmap is an lvalue (not a pointer).  The expansion is a brace block,
+ * so take care when using it as the sole body of an if/else.
+ */
+#define VMAP(vp, ip, vmap)	{(vmap).v_vfsp	 = (vp)->v_vfsp,	\
+				 (vmap).v_number = (vp)->v_number,	\
+				 (vmap).v_ino	 = (ip)->i_ino; }
+extern void	vn_purge(struct vnode *, vmap_t *);
+extern vnode_t	*vn_get(struct vnode *, vmap_t *);
+extern int	vn_revalidate(struct vnode *, int);
+extern void	vn_remove(struct vnode *);
+
+/* Reference count of a vnode: the i_count of its embedded linux inode. */
+static inline int vn_count(struct vnode *vp)
+{
+	return atomic_read(&LINVFS_GET_IP(vp)->i_count);
+}
+
+/*
+ * Vnode reference counting functions (and macros for compatibility).
+ *
+ * VN_HOLD goes through vn_hold(); VN_RELE calls iput() on the embedded
+ * inode directly.  With CONFIG_XFS_VNODE_TRACING both additionally
+ * record a trace entry carrying the caller's file and line.
+ */
+extern vnode_t	*vn_hold(struct vnode *);
+extern void	vn_rele(struct vnode *);
+
+#if defined(CONFIG_XFS_VNODE_TRACING)
+
+#define VN_HOLD(vp)		\
+	((void)vn_hold(vp),	\
+	  vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address))
+#define VN_RELE(vp)		\
+	  (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \
+	   iput(LINVFS_GET_IP(vp)))
+
+#else	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+#define VN_HOLD(vp)		((void)vn_hold(vp))
+#define VN_RELE(vp)		(iput(LINVFS_GET_IP(vp)))
+
+#endif	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+/*
+ * Vnode spinlock manipulation.
+ */
+#define VN_FLAGSET(vp,b)	vn_flagset(vp,b)
+#define VN_FLAGCLR(vp,b)	vn_flagclr(vp,b)
+
+/* OR flag into vp->v_flag while holding v_lock. */
+static __inline__ void vn_flagset(struct vnode *vp, uint flag)
+{
+	spin_lock(&vp->v_lock);
+	vp->v_flag |= flag;
+	spin_unlock(&vp->v_lock);
+}
+
+/* Clear flag from vp->v_flag while holding v_lock. */
+static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
+{
+	spin_lock(&vp->v_lock);
+	vp->v_flag &= ~flag;
+	spin_unlock(&vp->v_lock);
+}
+
+/*
+ * Some useful predicates.
+ */ +#define VN_MAPPED(vp) ((LINVFS_GET_IP(vp)->i_mapping->i_mmap != NULL) || \ + (LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared != NULL)) +#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages) +#define VN_DIRTY(vp) (!list_empty(&(LINVFS_GET_IP(vp)->i_dirty_data_buffers))) +#define VMODIFY(vp) { VN_FLAGSET(vp, VMODIFIED); \ + mark_inode_dirty(LINVFS_GET_IP(vp)); } +#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED) + +/* + * Flags to VOP_SETATTR/VOP_GETATTR. + */ +#define ATTR_UTIME 0x01 /* non-default utime(2) request */ +#define ATTR_EXEC 0x02 /* invocation from exec(2) */ +#define ATTR_COMM 0x04 /* yield common vp attributes */ +#define ATTR_DMI 0x08 /* invocation from a DMI function */ +#define ATTR_LAZY 0x80 /* set/get attributes lazily */ +#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */ +#define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */ +#define ATTR_NOSIZETOK 0x400 /* Don't get the DVN_SIZE_READ token */ + +/* + * Flags to VOP_FSYNC and VOP_RECLAIM. + */ +#define FSYNC_NOWAIT 0 /* asynchronous flush */ +#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ +#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ +#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ + +#if (defined(CONFIG_XFS_VNODE_TRACING)) + +#define VNODE_TRACE_SIZE 16 /* number of trace entries */ + +/* + * Tracing entries. + */ +#define VNODE_KTRACE_ENTRY 1 +#define VNODE_KTRACE_EXIT 2 +#define VNODE_KTRACE_HOLD 3 +#define VNODE_KTRACE_REF 4 +#define VNODE_KTRACE_RELE 5 + +extern void vn_trace_entry(struct vnode *, char *, inst_t *); +extern void vn_trace_exit(struct vnode *, char *, inst_t *); +extern void vn_trace_hold(struct vnode *, char *, int, inst_t *); +extern void vn_trace_ref(struct vnode *, char *, int, inst_t *); +extern void vn_trace_rele(struct vnode *, char *, int, inst_t *); +#define VN_TRACE(vp) \ + vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address) + +#else /* ! 
(defined(CONFIG_XFS_VNODE_TRACING)) */ + +#define vn_trace_entry(a,b,c) +#define vn_trace_exit(a,b,c) +#define vn_trace_hold(a,b,c,d) +#define vn_trace_ref(a,b,c,d) +#define vn_trace_rele(a,b,c,d) +#define VN_TRACE(vp) + +#endif /* ! (defined(CONFIG_XFS_VNODE_TRACING)) */ + +#endif /* __XFS_VNODE_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/Makefile linux-2.4-xfs/fs/xfs/pagebuf/Makefile --- linux-2.4.19/fs/xfs/pagebuf/Makefile Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/Makefile Mon Sep 2 16:46:33 2002 @@ -0,0 +1,56 @@ +# +# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# +# Makefile for the linux pagebuf routines. 
+# + +# Debug options: +# -DPAGEBUF_TRACKING +# -DPAGEBUF_LOCKING +# -DPAGEBUF_DEBUG +# -DPAGEBUF_TRACE +ifeq ($(CONFIG_PAGEBUF_DEBUG),y) + EXTRA_CFLAGS += -g -DSTATIC="" \ + -DPAGEBUF_DEBUG -DPAGEBUF_TRACKING -DPAGEBUF_TRACE +endif +EXTRA_CFLAGS += -I.. + +O_TARGET := pagebuf.o + +ifneq ($(MAKECMDGOALS),modules_install) + obj-m := $(O_TARGET) +endif + +export-objs += page_buf.o +obj-y += page_buf.o \ + page_buf_locking.o + +include $(TOPDIR)/Rules.make diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/Makefile.in linux-2.4-xfs/fs/xfs/pagebuf/Makefile.in --- linux-2.4.19/fs/xfs/pagebuf/Makefile.in Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/Makefile.in Wed Jul 10 23:14:43 2002 @@ -0,0 +1,50 @@ +# +# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# +# Makefile for the linux pagebuf routines. +# + +expsyms(page_buf.o) +objlink(pagebuf.o page_buf.o page_buf_io.o page_buf_locking.o) +# No select() for pagebuf.o in this directory. It is a sub-component of XFS, +# see fs/xfs/Makefile.in for the objlink. + +# Debug options: +# -DPAGEBUF_TRACKING +# -DPAGEBUF_LOCKING +# -DPAGEBUF_DEBUG +# -DPAGEBUF_TRACE +ifsel(CONFIG_PAGEBUF_DEBUG) + extra_cflags_all(-g -DSTATIC="" -DPAGEBUF_DEBUG -DPAGEBUF_TRACKING -DPAGEBUF_TRACE) +endif + +# FIXME: page_buf.c does #include +extra_cflags(page_buf.o $(src_includelist /fs/xfs)) diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/diff linux-2.4-xfs/fs/xfs/pagebuf/diff --- linux-2.4.19/fs/xfs/pagebuf/diff Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/diff Tue Aug 6 20:10:11 2002 @@ -0,0 +1,232 @@ +? .depend +? .page_buf.o.flags +? .page_buf_io.o.flags +? .page_buf_locking.o.flags +? diff +? .pagebuf.o.flags +? 
page_buf_inline.h +Index: page_buf.c +=================================================================== +RCS file: /cvs/linux-2.4-xfs/linux/fs/xfs/pagebuf/page_buf.c,v +retrieving revision 1.48 +diff -u -p -r1.48 page_buf.c +--- page_buf.c 2002/08/02 20:09:37 1.48 ++++ page_buf.c 2002/08/06 17:09:39 +@@ -239,8 +239,6 @@ _bhash( + * Mapping of multi-page buffers into contingous virtual space + */ + +-STATIC void *pagebuf_mapout_locked(page_buf_t *); +- + STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED; + typedef struct a_list { + void *vm_addr; +@@ -249,40 +247,6 @@ typedef struct a_list { + STATIC a_list_t *as_free_head; + STATIC int as_list_len; + +-STATIC void +-free_address(void *addr) +-{ +- a_list_t *aentry; +- +- spin_lock(&as_lock); +- aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC); +- aentry->next = as_free_head; +- aentry->vm_addr = addr; +- as_free_head = aentry; +- as_list_len++; +- spin_unlock(&as_lock); +-} +- +-STATIC void +-purge_addresses(void) +-{ +- a_list_t *aentry, *old; +- +- if (as_free_head == NULL) return; +- +- spin_lock(&as_lock); +- aentry = as_free_head; +- as_free_head = NULL; +- as_list_len = 0; +- spin_unlock(&as_lock); +- +- while ((old = aentry) != NULL) { +- vfree(aentry->vm_addr); +- aentry = aentry->next; +- kfree(old); +- } +-} +- + /* + * Locking model: + * +@@ -340,6 +304,88 @@ _pagebuf_initialize( + return (0); + } + ++STATIC void ++_pagebuf_purge_addresses(void) ++{ ++ a_list_t *aentry, *old; ++ ++ if (as_free_head == NULL) ++ return; ++ ++ spin_lock(&as_lock); ++ aentry = as_free_head; ++ as_free_head = NULL; ++ as_list_len = 0; ++ spin_unlock(&as_lock); ++ ++ while ((old = aentry) != NULL) { ++ vfree(aentry->vm_addr); ++ aentry = aentry->next; ++ kfree(old); ++ } ++} ++ ++STATIC int ++_pagebuf_map(page_buf_t *pb, int count, int flags, int gfp_mask) ++{ ++ if (count == 1) { ++ /* ++ * A single page buffer is always mappable, thus we mark it ++ * mapped even if we weren't requested to do so explicitly. 
++ * ++ * This might save some expensive calculations later on. ++ */ ++ ++ pb->pb_addr = (page_address(pb->pb_pages[0]) + pb->pb_offset); ++ pb->pb_flags |= PBF_MAPPED; ++ } else if (flags & PBF_MAPPED) { ++ /* ++ * For multipage buffers mapping is very expensive as it ++ * requires changes to the kernel virtaul address space. ++ * ++ * We do this only if we really need to. ++ */ ++ void *vaddr; ++ ++ if (as_list_len > 64) ++ _pagebuf_purge_addresses(); ++ ++ vaddr = remap_page_array(pb->pb_pages, count, gfp_mask); ++ if (unlikely(!vaddr)) ++ return -ENOMEM; ++ ++ pb->pb_addr = (vaddr + pb->pb_offset); ++ pb->pb_flags |= _PBF_ADDR_ALLOCATED; ++ } ++ ++ return 0; ++} ++ ++STATIC void ++_pagebuf_unmap(page_buf_t *pb) ++{ ++ if (pb->pb_flags & _PBF_ADDR_ALLOCATED) { ++ void *addr = (pb->pb_addr - pb->pb_offset); ++ a_list_t *aentry = kmalloc(sizeof(*aentry), GFP_ATOMIC); ++ ++ /* ++ * If we can't allocate a list entry we have to free now. ++ */ ++ if (aentry) { ++ spin_lock(&as_lock); ++ aentry->vm_addr = addr; ++ as_free_head = aentry; ++ aentry->next = as_free_head; ++ as_list_len++; ++ spin_unlock(&as_lock); ++ } else ++ vfree(addr); ++ } ++ ++ pb->pb_addr = NULL; ++ pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED); ++} ++ + /* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. 
+@@ -415,11 +461,8 @@ void _pagebuf_free_object( + + if (!(pb_flags & PBF_FREED)) { + /* release any virtual mapping */ ; +- if (pb->pb_flags & _PBF_ADDR_ALLOCATED) { +- void *vaddr = pagebuf_mapout_locked(pb); +- if (vaddr) { +- free_address(vaddr); +- } ++ if (pb->pb_flags & PBF_MAPPED) { ++ _pagebuf_unmap(pb); + } + + if (pb->pb_flags & _PBF_MEM_ALLOCATED) { +@@ -578,23 +621,12 @@ mapit: + pb->pb_flags |= _PBF_MEM_ALLOCATED; + if (all_mapped) { + pb->pb_flags |= _PBF_ALL_PAGES_MAPPED; +- /* A single page buffer is always mappable */ +- if (page_count == 1) { +- pb->pb_addr = +- (caddr_t) page_address(pb->pb_pages[0]) + +- pb->pb_offset; +- pb->pb_flags |= PBF_MAPPED; +- } else if (flags & PBF_MAPPED) { +- if (as_list_len > 64) +- purge_addresses(); +- pb->pb_addr = remap_page_array(pb->pb_pages, +- page_count, gfp_mask); +- if (!pb->pb_addr) +- BUG(); +- pb->pb_addr += pb->pb_offset; +- pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED; +- } ++ ++ /* TODO(hch): handle this error case properly. */ ++ if (unlikely(_pagebuf_map(pb, page_count, flags, gfp_mask))) ++ BUG(); + } ++ + /* If some pages were found with data in them + * we are not in PBF_NONE state. 
+ */ +@@ -1830,26 +1862,6 @@ int pagebuf_iowait(page_buf_t * pb) /* b + } + + +-/* reverse pagebuf_mapin() */ +-STATIC void * +-pagebuf_mapout_locked( +- page_buf_t * pb) /* buffer to unmap */ +-{ +- void *old_addr = NULL; +- +- if (pb->pb_flags & PBF_MAPPED) { +- if (pb->pb_flags & _PBF_ADDR_ALLOCATED) +- old_addr = pb->pb_addr - pb->pb_offset; +- pb->pb_addr = NULL; +- pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED); +- } +- +- return (old_addr); /* Caller must free the address space, +- * we are under a spin lock, probably +- * not safe to do vfree here +- */ +-} +- + caddr_t + pagebuf_offset(page_buf_t *pb, off_t offset) + { +@@ -2189,7 +2201,7 @@ pagebuf_daemon(void *data) + if (count) + run_task_queue(&tq_disk); + if (as_list_len > 0) +- purge_addresses(); ++ _pagebuf_purge_addresses(); + + force_flush = 0; + } while (pb_daemon->active == 1); diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/page_buf.c linux-2.4-xfs/fs/xfs/pagebuf/page_buf.c --- linux-2.4.19/fs/xfs/pagebuf/page_buf.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/page_buf.c Thu Aug 29 03:39:02 2002 @@ -0,0 +1,2524 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. 
Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * page_buf.c + * + * The page_buf module provides an abstract buffer cache model on top of + * the Linux page cache. Cached blocks for a file are hashed to the + * inode for that file, and can be held dirty in delayed write mode in + * the page cache. Cached metadata blocks for a file system are hashed + * to the inode for the mounted device. The page_buf module assembles + * buffer (page_buf_t) objects on demand to aggregate such cached pages + * for I/O. + * + * + * Written by Steve Lord, Jim Mostek, Russell Cattelan + * and Rajagopal Ananthanarayanan ("ananth") at SGI. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "page_buf_internal.h" + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1<pb_flags; + pb_trace.buf[j].hold = pb->pb_hold.counter; + pb_trace.buf[j].lock_value = PBP(pb)->pb_sema.count.counter; + pb_trace.buf[j].task = (void *)current; + pb_trace.buf[j].misc = misc; + pb_trace.buf[j].ra = ra; + pb_trace.buf[j].offset = pb->pb_file_offset; + pb_trace.buf[j].size = pb->pb_buffer_length; +} +#endif /* PAGEBUF_TRACE */ + +#ifdef PAGEBUF_TRACKING +#define MAX_PB 10000 +page_buf_t *pb_array[MAX_PB]; +EXPORT_SYMBOL(pb_array); + +void +pb_tracking_get( + page_buf_t *pb) +{ + int i; + + for (i = 0; (pb_array[i] != 0) && (i < MAX_PB); i++) { } + if (i == MAX_PB) + printk("pb 0x%p not recorded in pb_array\n", pb); + else { + //printk("pb_get 0x%p in pb_array[%d]\n", pb, i); + pb_array[i] = pb; + } +} + +void +pb_tracking_free( + page_buf_t *pb) +{ + int i; + + for (i = 0; (pb_array[i] != pb) && (i < MAX_PB); i++) { } + if (i < MAX_PB) { + //printk("pb_free 0x%p from pb_array[%d]\n", pb, i); + pb_array[i] = NULL; + } + else + printk("Freed unmonitored pagebuf 0x%p\n", pb); +} +#else +#define pb_tracking_get(pb) do { } while (0) +#define pb_tracking_free(pb) do { } while (0) +#endif /* PAGEBUF_TRACKING */ + +/* + * File wide globals + */ + +STATIC kmem_cache_t *pagebuf_cache; +STATIC pagebuf_daemon_t *pb_daemon; +STATIC struct list_head pagebuf_iodone_tq[NR_CPUS]; +STATIC wait_queue_head_t pagebuf_iodone_wait[NR_CPUS]; + +/* + * For pre-allocated buffer head pool + */ + +#define NR_RESERVED_BH 64 +static wait_queue_head_t pb_resv_bh_wait; +static spinlock_t pb_resv_bh_lock = SPIN_LOCK_UNLOCKED; +struct buffer_head *pb_resv_bh = NULL; /* list of bh */ +int pb_resv_bh_cnt = 0; /* # of bh available */ + +STATIC void pagebuf_daemon_wakeup(int); +STATIC int _pagebuf_segment_apply(page_buf_t *); + +/* + * Pagebuf module configuration 
parameters, exported via + * /proc/sys/vm/pagebuf + */ + +unsigned long pagebuf_min[P_PARAM] = { HZ/2, 1*HZ, 0, 0 }; +unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, 1, 1 }; + +pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 0, 0 }}; + +/* + * Pagebuf statistics variables + */ + +struct pbstats pbstats; + +/* + * Pagebuf allocation / freeing. + */ + +#define pb_to_gfp(flags) \ + (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \ + ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) + +#define pagebuf_allocate(flags) \ + kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags)) +#define pagebuf_deallocate(pb) \ + kmem_cache_free(pagebuf_cache, (pb)); + +/* + * Pagebuf hashing + */ + +#define NBITS 5 +#define NHASH (1<pb_hash_index] + +STATIC int +_bhash( + dev_t dev, + loff_t base) +{ + int bit, hval; + + base >>= 9; + /* + * dev_t is 16 bits, loff_t is always 64 bits + */ + base ^= dev; + for (bit = hval = 0; base != 0 && bit < sizeof(base) * 8; bit += NBITS) { + hval ^= (int)base & (NHASH-1); + base >>= NBITS; + } + return hval; +} + +/* + * Mapping of multi-page buffers into contingous virtual space + */ + +STATIC void *pagebuf_mapout_locked(page_buf_t *); + +STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED; +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; +STATIC a_list_t *as_free_head; +STATIC int as_list_len; + +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + + spin_lock(&as_lock); + aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + +/* + * Locking model: + * + * Buffers 
associated with inodes for which buffer locking
+ * is not enabled are not protected by semaphores, and are
+ * assumed to be exclusively owned by the caller.  There is a
+ * spinlock in the buffer, for use by the caller when concurrent
+ * access is possible.
+ */
+
+/*
+ * Internal pagebuf object manipulation
+ *
+ * _pagebuf_initialize() resets a freshly allocated pagebuf:
+ *   - the object is zeroed over sizeof(page_buf_private_t)
+ *     (presumably page_buf_t is the leading part of that structure -
+ *     TODO confirm against page_buf_internal.h);
+ *   - the hold count starts at 1;
+ *   - pb_iodonesema and the private pb_sema are both initialised in
+ *     the locked state, so the buffer is returned locked;
+ *   - pb_file_offset, pb_buffer_length and pb_count_desired are taken
+ *     from range_base / range_length;
+ *   - pb_flags is the caller's flags minus the request-only bits
+ *     masked below, plus PBF_NONE;
+ *   - pb_bn is PAGE_BUF_DADDR_NULL until a caller assigns a block.
+ */
+
+STATIC void
+_pagebuf_initialize(
+	page_buf_t		*pb,
+	pb_target_t		*target,
+	loff_t			range_base,
+	size_t			range_length,
+	page_buf_flags_t	flags)
+{
+	/*
+	 * We don't want certain flags to appear in pb->pb_flags.
+	 */
+	flags &= ~(PBF_LOCK|PBF_ENTER_PAGES|PBF_MAPPED);
+	flags &= ~(PBF_DONT_BLOCK|PBF_READ_AHEAD);
+
+	pb_tracking_get(pb);
+
+	memset(pb, 0, sizeof(page_buf_private_t));	/* everything below starts from zero */
+	atomic_set(&pb->pb_hold, 1);
+	init_MUTEX_LOCKED(&pb->pb_iodonesema);
+	INIT_LIST_HEAD(&pb->pb_list);
+	INIT_LIST_HEAD(&pb->pb_hash_list);
+	init_MUTEX_LOCKED(&PBP(pb)->pb_sema); /* held, no waiters */
+	PB_SET_OWNER(pb);
+	pb->pb_target = target;
+	pb->pb_file_offset = range_base;
+	/*
+	 * Set buffer_length and count_desired to the same value initially.
+	 * IO routines should use count_desired, which will be the same in
+	 * most cases but may be reset (e.g. XFS recovery).
+	 */
+	pb->pb_buffer_length = pb->pb_count_desired = range_length;
+	pb->pb_flags = flags | PBF_NONE;
+	pb->pb_bn = PAGE_BUF_DADDR_NULL;
+	atomic_set(&PBP(pb)->pb_pin_count, 0);
+	init_waitqueue_head(&PBP(pb)->pb_waiters);
+
+	PB_STATS_INC(pbstats.pb_create);
+	PB_TRACE(pb, PB_TRACE_REC(get), target);
+}
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_pagebuf_get_pages(
+	page_buf_t		*pb,
+	int			page_count,
+	int			flags)
+{
+	size_t			nbytes = sizeof(struct page *) * page_count;
+	struct page		**pages;
+
+	/* A page list is already attached: nothing to set up. */
+	if (pb->pb_pages != NULL)
+		return 0;
+
+	pb->pb_offset = page_buf_poff(pb->pb_file_offset);
+	pb->pb_page_count = page_count;
+
+	if (page_count > PB_PAGES) {
+		/* too many pages for the inline array, go to the heap */
+		pages = kmalloc(nbytes, pb_to_gfp(flags));
+		pb->pb_pages = pages;
+		if (pages == NULL)
+			return -ENOMEM;
+	} else {
+		pages = pb->pb_page_array;
+		pb->pb_pages = pages;
+	}
+
+	memset(pages, 0, nbytes);
+	return 0;
+}
+
+/*
+ * Drop the page cache reference on every page still recorded in the
+ * pagebuf, then free the page pointer array itself unless it is the
+ * array embedded in the pagebuf.
+ */
+STATIC inline void
+_pagebuf_freepages(
+	page_buf_t		*pb)
+{
+	struct page		**slot;
+	struct page		*victim;
+	int			i;
+
+	for (i = 0; i < pb->pb_page_count; i++) {
+		slot = &pb->pb_pages[i];
+		victim = *slot;
+		if (victim == NULL)
+			continue;
+
+		/* forget the page before letting go of it */
+		*slot = NULL;
+		page_cache_release(victim);
+	}
+
+	if (pb->pb_pages == pb->pb_page_array)
+		return;
+	kfree(pb->pb_pages);
+}
+
+/*
+ *	_pagebuf_free_object
+ *
+ *	_pagebuf_free_object releases the contents specified buffer.
+ *	The modification state of any associated pages is left unchanged.
+ */ +void +_pagebuf_free_object( + pb_hash_t *hash, /* hash bucket for buffer */ + page_buf_t *pb) /* buffer to deallocate */ +{ + int pb_flags = pb->pb_flags; + + PB_TRACE(pb, PB_TRACE_REC(free_obj), 0); + pb->pb_flags |= PBF_FREED; + + if (hash) { + if (!list_empty(&pb->pb_hash_list)) { + hash->pb_count--; + list_del_init(&pb->pb_hash_list); + } + spin_unlock(&hash->pb_hash_lock); + } + + if (!(pb_flags & PBF_FREED)) { + /* release any virtual mapping */ ; + if (pb->pb_flags & _PBF_ADDR_ALLOCATED) { + void *vaddr = pagebuf_mapout_locked(pb); + if (vaddr) { + free_address(vaddr); + } + } + + if (pb->pb_flags & _PBF_MEM_ALLOCATED) { + if (pb->pb_pages) { + /* release the pages in the address list */ + if (pb->pb_pages[0] && + PageSlab(pb->pb_pages[0])) { + /* + * This came from the slab + * allocator free it as such + */ + kfree(pb->pb_addr); + } else { + _pagebuf_freepages(pb); + } + + pb->pb_pages = NULL; + } + pb->pb_flags &= ~_PBF_MEM_ALLOCATED; + } + } + + pb_tracking_free(pb); + pagebuf_deallocate(pb); +} + +/* + * _pagebuf_lookup_pages + * + * _pagebuf_lookup_pages finds all pages which match the buffer + * in question and the range of file offsets supplied, + * and builds the page list for the buffer, if the + * page list is not already formed or if not all of the pages are + * already in the list. Invalid pages (pages which have not yet been + * read in from disk) are assigned for any pages which are not found. + */ +STATIC int +_pagebuf_lookup_pages( + page_buf_t *pb, + struct address_space *aspace, + page_buf_flags_t flags) +{ + loff_t next_buffer_offset; + unsigned long page_count, pi, index; + struct page *page; + int gfp_mask, retry_count = 5, rval = 0; + int all_mapped, good_pages; + size_t blocksize; + + /* For pagebufs where we want to map an address, do not use + * highmem pages - so that we do not need to use kmap resources + * to access the data. + * + * For pages where the caller has indicated there may be resource + * contention (e.g. 
called from a transaction) do not flush + * delalloc pages to obtain memory. + */ + + if (flags & PBF_READ_AHEAD) { + gfp_mask = GFP_READAHEAD; + retry_count = 0; + } else if (flags & PBF_DONT_BLOCK) { + gfp_mask = GFP_NOFS; + } else if (flags & PBF_MAPPABLE) { + gfp_mask = GFP_KERNEL; + } else { + gfp_mask = GFP_HIGHUSER; + } + + next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length; + + good_pages = page_count = (page_buf_btoc(next_buffer_offset) - + page_buf_btoct(pb->pb_file_offset)); + + if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) { + /* Bring pages forward in cache */ + for (pi = 0; pi < page_count; pi++) { + mark_page_accessed(pb->pb_pages[pi]); + } + if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) { + all_mapped = 1; + goto mapit; + } + return 0; + } + + /* Ensure pb_pages field has been initialised */ + rval = _pagebuf_get_pages(pb, page_count, flags); + if (rval) + return rval; + + rval = pi = 0; + blocksize = pb->pb_target->pbr_blocksize; + + /* Enter the pages in the page list */ + index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT; + for (all_mapped = 1; pi < page_count; pi++, index++) { + if (pb->pb_pages[pi] == 0) { + retry: + page = find_or_create_page(aspace, index, gfp_mask); + if (!page) { + if (--retry_count > 0) { + PB_STATS_INC(pbstats.pb_page_retries); + pagebuf_daemon_wakeup(1); + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(10); + goto retry; + } + rval = -ENOMEM; + all_mapped = 0; + continue; + } + PB_STATS_INC(pbstats.pb_page_found); + mark_page_accessed(page); + pb->pb_pages[pi] = page; + } else { + page = pb->pb_pages[pi]; + lock_page(page); + } + + /* If we need to do I/O on a page record the fact */ + if (!Page_Uptodate(page)) { + good_pages--; + if ((blocksize == PAGE_CACHE_SIZE) && + (flags & PBF_READ)) + pb->pb_locked = 1; + } + } + + if (!pb->pb_locked) { + for (pi = 0; pi < page_count; pi++) { + unlock_page(pb->pb_pages[pi]); + } + } + +mapit: + pb->pb_flags |= _PBF_MEM_ALLOCATED; + if 
(all_mapped) { + pb->pb_flags |= _PBF_ALL_PAGES_MAPPED; + + /* A single page buffer is always mappable */ + if (page_count == 1) { + pb->pb_addr = (caddr_t) + page_address(pb->pb_pages[0]) + pb->pb_offset; + pb->pb_flags |= PBF_MAPPED; + } else if (flags & PBF_MAPPED) { + if (as_list_len > 64) + purge_addresses(); + pb->pb_addr = vmap(pb->pb_pages, page_count); + if (!pb->pb_addr) + BUG(); + pb->pb_addr += pb->pb_offset; + pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED; + } + } + /* If some pages were found with data in them + * we are not in PBF_NONE state. + */ + if (good_pages != 0) { + pb->pb_flags &= ~(PBF_NONE); + if (good_pages != page_count) { + pb->pb_flags |= PBF_PARTIAL; + } + } + + PB_TRACE(pb, PB_TRACE_REC(look_pg), good_pages); + + return rval; +} + + +/* + * Pre-allocation of a pool of buffer heads for use in + * low-memory situations. + */ + +/* + * _pagebuf_prealloc_bh + * + * Pre-allocate a pool of "count" buffer heads at startup. + * Puts them on a list at "pb_resv_bh" + * Returns number of bh actually allocated to pool. + */ +STATIC int +_pagebuf_prealloc_bh( + int count) +{ + struct buffer_head *bh; + int i; + + for (i = 0; i < count; i++) { + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); + if (!bh) + break; + bh->b_pprev = &pb_resv_bh; + bh->b_next = pb_resv_bh; + pb_resv_bh = bh; + pb_resv_bh_cnt++; + } + return i; +} + +/* + * _pagebuf_get_prealloc_bh + * + * Get one buffer head from our pre-allocated pool. + * If pool is empty, sleep 'til one comes back in. + * Returns aforementioned buffer head. 
+ */ +STATIC struct buffer_head * +_pagebuf_get_prealloc_bh(void) +{ + unsigned long flags; + struct buffer_head *bh = NULL; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE (wait, tsk); + + spin_lock_irqsave(&pb_resv_bh_lock, flags); + + if (pb_resv_bh_cnt < 1) { + + add_wait_queue(&pb_resv_bh_wait, &wait); + do { + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + spin_unlock_irqrestore(&pb_resv_bh_lock, flags); + schedule(); + spin_lock_irqsave(&pb_resv_bh_lock, flags); + } while (pb_resv_bh_cnt < 1); + tsk->state = TASK_RUNNING; + remove_wait_queue(&pb_resv_bh_wait, &wait); + } + + if (pb_resv_bh_cnt < 1) + BUG(); + + bh = pb_resv_bh; + + if (!bh) + BUG(); + + pb_resv_bh = bh->b_next; + bh->b_state = 0; + pb_resv_bh_cnt--; + + spin_unlock_irqrestore(&pb_resv_bh_lock, flags); + + return bh; +} + +/* + * _pagebuf_free_bh + * + * Take care of buffer heads that we're finished with. + * Call this instead of just kmem_cache_free(bh_cachep, bh) + * when you're done with a bh. + * + * If our pre-allocated pool is full, just free the buffer head. + * Otherwise, put it back in the pool, and wake up anybody + * waiting for one. + */ +STATIC inline void +_pagebuf_free_bh( + struct buffer_head *bh) +{ + unsigned long flags; + + if (pb_resv_bh_cnt == NR_RESERVED_BH){ + kmem_cache_free(bh_cachep, bh); + } else { + spin_lock_irqsave(&pb_resv_bh_lock, flags); + + bh->b_pprev = &pb_resv_bh; + bh->b_next = pb_resv_bh; + pb_resv_bh = bh; + pb_resv_bh_cnt++; + + if (waitqueue_active(&pb_resv_bh_wait)) { + wake_up(&pb_resv_bh_wait); + } + + spin_unlock_irqrestore(&pb_resv_bh_lock, flags); + } +} + +/* + * Finding and Reading Buffers + */ + +/* + * _pagebuf_find + * + * Looks up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. 
If other overlapping buffers exist, they are
+ *	released before the new buffer is created and locked,
+ *	which may imply that this call will block until those buffers
+ *	are unlocked.  No I/O is implied by this call.
+ *
+ *	ioff and isize are in 512-byte sectors (they are shifted by
+ *	SECTOR_SHIFT before use).  A match requires the same target,
+ *	byte offset and byte length.  On a miss, new_pb (when non-NULL)
+ *	is initialised, inserted into the hash bucket and returned; with
+ *	new_pb == NULL a miss returns NULL.  On a hit the buffer gains a
+ *	hold and is locked; with PBF_TRYLOCK a busy buffer yields NULL
+ *	instead of sleeping.
+ */
+STATIC page_buf_t *
+_pagebuf_find(				/* find buffer for block	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
+	page_buf_t		*new_pb)/* newly allocated buffer	*/
+{
+	loff_t			range_base;
+	size_t			range_length;
+	int			hval;
+	pb_hash_t		*h;
+	struct list_head	*p;
+	page_buf_t		*pb;
+	int			not_locked;
+
+	range_base = (ioff << SECTOR_SHIFT);
+	range_length = (isize << SECTOR_SHIFT);
+
+	hval = _bhash(target->pbr_bdev->bd_dev, range_base);
+	h = &pbhash[hval];
+
+	spin_lock(&h->pb_hash_lock);
+	list_for_each(p, &h->pb_hash) {
+		pb = list_entry(p, page_buf_t, pb_hash_list);
+
+		if ((target == pb->pb_target) &&
+		    (pb->pb_file_offset == range_base) &&
+		    (pb->pb_buffer_length == range_length)) {
+			if (pb->pb_flags & PBF_FREED)
+				break;	/* being torn down: treat as a miss */
+			/* If we look at something bring it to the
+			 * front of the list for next time
+			 */
+			list_del(&pb->pb_hash_list);
+			list_add(&pb->pb_hash_list, &h->pb_hash);
+			goto found;
+		}
+	}
+
+	/* No match found */
+	if (new_pb) {
+		_pagebuf_initialize(new_pb, target, range_base,
+				range_length, flags | _PBF_LOCKABLE);
+		new_pb->pb_hash_index = hval;
+		h->pb_count++;
+		list_add(&new_pb->pb_hash_list, &h->pb_hash);
+	} else {
+		PB_STATS_INC(pbstats.pb_miss_locked);
+	}
+
+	spin_unlock(&h->pb_hash_lock);
+	return (new_pb);
+
+found:
+	atomic_inc(&pb->pb_hold);	/* taken under the hash lock so pb cannot vanish */
+	spin_unlock(&h->pb_hash_lock);
+
+	/* Attempt to get the semaphore without sleeping; the hash
+	 * spinlock has already been dropped above, so if this does
+	 * not work we can do a hard (sleeping) attempt on the semaphore.
+ */
+	not_locked = down_trylock(&PBP(pb)->pb_sema);
+	if (not_locked) {
+		if (!(flags & PBF_TRYLOCK)) {
+			/* wait for buffer ownership */
+			PB_TRACE(pb, PB_TRACE_REC(get_lk), 0);
+			pagebuf_lock(pb);
+			PB_STATS_INC(pbstats.pb_get_locked_waited);
+		} else {
+			/* We asked for a trylock and failed, no need
+			 * to look at file offset and length here, we
+			 * know that this pagebuf at least overlaps our
+			 * pagebuf and is locked, therefore our buffer
+			 * either does not exist, or is this buffer
+			 */
+
+			pagebuf_rele(pb);
+			PB_STATS_INC(pbstats.pb_busy_locked);
+			return (NULL);
+		}
+	} else {
+		/* trylock worked */
+		PB_SET_OWNER(pb);
+	}
+
+	/* A stale buffer is recycled: keep only the mapping/memory flags. */
+	if (pb->pb_flags & PBF_STALE)
+		pb->pb_flags &=	PBF_MAPPABLE | \
+				PBF_MAPPED | \
+				_PBF_LOCKABLE | \
+				_PBF_ALL_PAGES_MAPPED | \
+				_PBF_SOME_INVALID_PAGES | \
+				_PBF_ADDR_ALLOCATED | \
+				_PBF_MEM_ALLOCATED;
+	PB_TRACE(pb, PB_TRACE_REC(got_lk), 0);
+	PB_STATS_INC(pbstats.pb_get_locked);
+	return (pb);
+}
+
+
+/*
+ *	pagebuf_find
+ *
+ *	pagebuf_find returns a buffer matching the specified range of
+ *	data for the specified target, if any of the relevant blocks
+ *	are in memory.  The buffer may have unallocated holes, if
+ *	some, but not all, of the blocks are in memory.  Even where
+ *	pages are present in the buffer, not all of every page may be
+ *	valid.  The file system may use pagebuf_segment to visit the
+ *	various segments of the buffer.
+ *
+ *	This is _pagebuf_find() without a spare buffer, so a miss
+ *	returns NULL and nothing is ever created.
+ */
+page_buf_t *
+pagebuf_find(				/* find buffer for block	*/
+					/* if the block is in memory	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	return _pagebuf_find(target, ioff, isize, flags, NULL);
+}
+
+/*
+ *	pagebuf_get
+ *
+ *	pagebuf_get assembles a buffer covering the specified range.
+ *	Some or all of the blocks in the range may be valid.  The file
+ *	system may use pagebuf_segment to visit the various segments
+ *	of the buffer.
Storage in memory for all portions of the
+ *	buffer will be allocated, although backing storage may not be.
+ *	If PBF_READ is set in flags and the buffer is not already done,
+ *	the read is started with pagebuf_iostart().  An asynchronous
+ *	read request that is already satisfied drops the buffer and
+ *	returns NULL.  NULL is also returned when no pagebuf or pages
+ *	could be obtained.
+ */
+page_buf_t *
+pagebuf_get(				/* allocate a buffer		*/
+	pb_target_t		*target,/* target for buffer		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	page_buf_t		*spare;
+	page_buf_t		*pb;
+
+	/* Have a fresh object ready in case the lookup misses. */
+	spare = pagebuf_allocate(flags);
+	if (unlikely(spare == NULL))
+		return (NULL);
+
+	pb = _pagebuf_find(target, ioff, isize, flags, spare);
+	if (pb != spare) {
+		/* hit (or busy under trylock): the spare is not needed */
+		pagebuf_deallocate(spare);
+		if (unlikely(pb == NULL))
+			return (NULL);
+	}
+
+	PB_STATS_INC(pbstats.pb_get);
+
+	/* fill in any missing pages */
+	if (unlikely(_pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping,
+					   flags))) {
+		pagebuf_free(pb);
+		return (NULL);
+	}
+
+	/*
+	 * The block number is always recorded here; mapped callers may
+	 * overlay their own value afterwards.
+	 */
+	pb->pb_bn = ioff;
+	pb->pb_count_desired = pb->pb_buffer_length;
+
+	if (!(flags & PBF_READ))
+		goto out;
+
+	if (PBF_NOT_DONE(pb)) {
+		PB_TRACE(pb, PB_TRACE_REC(get_read), flags);
+		PB_STATS_INC(pbstats.pb_get_read);
+		pagebuf_iostart(pb, flags);
+		goto out;
+	}
+
+	if (!(flags & PBF_ASYNC)) {
+		/* data already valid: the read bit must not linger */
+		pb->pb_flags &= ~PBF_READ;
+		goto out;
+	}
+
+	/*
+	 * Read-ahead request whose data is already present:
+	 * nothing to do, let go of the buffer.
+	 */
+	if (flags & (PBF_LOCK | PBF_TRYLOCK))
+		pagebuf_unlock(pb);
+	pagebuf_rele(pb);
+	return NULL;
+
+out:
+	PB_TRACE(pb, PB_TRACE_REC(get_obj), flags);
+	return (pb);
+}
+
+/*
+ * Create a pagebuf and populate it with pages from the address
+ * space of the passed in inode.
+ */
+page_buf_t *
+pagebuf_lookup(
+	struct pb_target	*target,
+	struct inode		*inode,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+	page_buf_t		*pb = NULL;
+	int			status;
+
+	/* buffers built here always carry _PBF_PRIVATE_BH */
+	flags |= _PBF_PRIVATE_BH;
+	pb = pagebuf_allocate(flags);
+	if (pb) {
+		_pagebuf_initialize(pb, target, ioff, isize, flags);
+		if (flags & PBF_ENTER_PAGES) {
+			status = _pagebuf_lookup_pages(pb, &inode->i_data, 0);
+			if (status != 0) {
+				pagebuf_free(pb);
+				return (NULL);
+			}
+		}
+	}
+	return pb;
+}
+
+/*
+ * If we are not low on memory then do the readahead in a deadlock
+ * safe manner.
+ *
+ * The request is issued as a trylock, asynchronous, mappable read
+ * through pagebuf_get; its return value is not used.
+ */
+void
+pagebuf_readahead(
+	pb_target_t		*target,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+	flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
+	pagebuf_get(target, ioff, isize, flags);
+}
+
+/*
+ * Allocate a lockable pagebuf for the target with zero offset and
+ * zero length.  Returns NULL if the allocation fails.
+ */
+page_buf_t *
+pagebuf_get_empty(
+	pb_target_t		*target)
+{
+	page_buf_t		*pb;
+
+	pb = pagebuf_allocate(_PBF_LOCKABLE);
+	if (pb)
+		_pagebuf_initialize(pb, target, 0, 0, _PBF_LOCKABLE);
+	return pb;
+}
+
+/*
+ * Translate a kernel virtual address into its struct page, using
+ * vmalloc_to_page for addresses inside [VMALLOC_START, VMALLOC_END)
+ * and virt_to_page for everything else.
+ */
+static inline struct page *
+mem_to_page(
+	void			*addr)
+{
+	if (((unsigned long)addr < VMALLOC_START) ||
+	    ((unsigned long)addr >= VMALLOC_END)) {
+		return virt_to_page(addr);
+	} else {
+		return vmalloc_to_page(addr);
+	}
+}
+
+/*
+ *	pagebuf_associate_memory
+ *
+ *	Point an existing pagebuf at the caller's memory region
+ *	[mem, mem + len): any separately allocated page pointer array is
+ *	freed, a new one is obtained from _pagebuf_get_pages, and every
+ *	page the region touches is recorded in it.  On success the buffer
+ *	is marked PBF_MAPPED | _PBF_PRIVATE_BH and 0 is returned;
+ *	otherwise the error from _pagebuf_get_pages is returned.
+ */
+int
+pagebuf_associate_memory(
+	page_buf_t		*pb,
+	void			*mem,
+	size_t			len)
+{
+	int			rval;
+	int			i = 0;
+	size_t			ptr;
+	size_t			end;
+	off_t			offset;
+	int			page_count;
+
+	/* offset of mem within its first page */
+	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+
+	/*
+	 * Count every page the region touches.  The count has to include
+	 * the leading offset: an unaligned region of at most one page can
+	 * still straddle a page boundary, and the loop below stores one
+	 * pointer for each page between the first page and the aligned end.
+	 */
+	page_count = PAGE_CACHE_ALIGN((size_t)offset + len) >> PAGE_CACHE_SHIFT;
+	if (page_count == 0)
+		page_count = 1;		/* pb_pages[0] is always written */
+
+	/* Free any previous set of page pointers */
+	if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
+		kfree(pb->pb_pages);
+	}
+	pb->pb_pages = NULL;
+	pb->pb_addr = mem;
+
+	rval = _pagebuf_get_pages(pb, page_count, 0);
+	if (rval)
+		return rval;
+
+	pb->pb_offset = offset;
+	ptr = (size_t) mem & PAGE_CACHE_MASK;
+	end = PAGE_CACHE_ALIGN((size_t) mem + len);
+	/* set up first page */
+	pb->pb_pages[0] = mem_to_page(mem);
+
+	ptr += PAGE_CACHE_SIZE;
+	pb->pb_page_count = ++i;
+	while (ptr < end) {
+		pb->pb_pages[i] = mem_to_page((void *)ptr);
+		pb->pb_page_count = ++i;
+		ptr += PAGE_CACHE_SIZE;
+	}
+	pb->pb_locked = 0;
+
+	pb->pb_count_desired = pb->pb_buffer_length = len;
+	pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH;
+
+	return 0;
+}
+
+/*
+ *	pagebuf_get_no_daddr
+ *
+ *	Build a pagebuf of len bytes backed by kmalloc'ed memory.
+ *	Requests larger than 0x20000 bytes are refused.  The allocation is
+ *	retried with a doubled size until the address returned has none of
+ *	the SECTOR_MASK bits set.  The buffer is returned with its
+ *	semaphore released; NULL is returned on any failure.
+ */
+page_buf_t *
+pagebuf_get_no_daddr(
+	size_t			len,
+	pb_target_t		*target)
+{
+	int			rval;
+	void			*rmem = NULL;
+	int			flags = _PBF_LOCKABLE | PBF_FORCEIO;
+	page_buf_t		*pb;
+	size_t			tlen  = 0;
+
+	if (len > 0x20000)
+		return(NULL);
+
+	pb = pagebuf_allocate(flags);
+	if (!pb)
+		return NULL;
+
+	_pagebuf_initialize(pb, target, 0, len, flags);
+
+	do {
+		if (tlen == 0) {
+			tlen = len; /* first time */
+		} else {
+			kfree(rmem); /* free the mem from the previous try */
+			tlen <<= 1; /* double the size and try again */
+			/*
+			printk(
+			"pb_get_no_daddr NOT block 0x%p mask 0x%p len %d\n",
+				rmem, ((size_t)rmem & (size_t)~SECTOR_MASK),
+				len);
+			*/
+		}
+		if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) {
+			pagebuf_free(pb);
+			return NULL;
+		}
+	} while ((size_t)rmem != ((size_t)rmem & (size_t)~SECTOR_MASK));
+
+	if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
+		kfree(rmem);
+		pagebuf_free(pb);
+		return NULL;
+	}
+	/* otherwise pagebuf_free just ignores it */
+	pb->pb_flags |= _PBF_MEM_ALLOCATED;
+	up(&PBP(pb)->pb_sema);	/* Return unlocked pagebuf */
+
+	PB_TRACE(pb, PB_TRACE_REC(no_daddr), rmem);
+
+	return pb;
+}
+
+
+/*
+ *	pagebuf_hold
+ *
+ *	Increment reference count on buffer, to hold the buffer concurrently
+ *	with another thread which may release (free) the buffer asynchronously.
+ *
+ *	Must hold the buffer already to call this function.
+ */
+void
+pagebuf_hold(
+	page_buf_t		*pb)
+{
+	atomic_inc(&pb->pb_hold);
+	PB_TRACE(pb, PB_TRACE_REC(hold), 0);
+}
+
+/*
+ *	pagebuf_free
+ *
+ *	pagebuf_free releases the specified buffer.  The modification
+ *	state of any associated pages is left unchanged.
+ */
+void
+pagebuf_free(
+	page_buf_t		*pb)
+{
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+		pb_hash_t	*h = pb_hash(pb);
+
+		/*
+		 * The hash bucket lock is taken here and is still held
+		 * when _pagebuf_free_object is called.
+		 * NOTE(review): presumably _pagebuf_free_object drops it;
+		 * confirm against that function.
+		 */
+		spin_lock(&h->pb_hash_lock);
+		_pagebuf_free_object(h, pb);
+	} else {
+		/* buffers that are not lockable are freed with no hash bucket */
+		_pagebuf_free_object(NULL, pb);
+	}
+}
+
+/*
+ *	pagebuf_rele
+ *
+ *	pagebuf_rele releases a hold on the specified buffer.  If the
+ *	hold count is 1, the buffer is either passed to its pb_relse
+ *	routine, put on the delayed write queue (PBF_DELWRI), kept
+ *	(PBF_FS_MANAGED), or freed through _pagebuf_free_object.
+ *
+ *	For lockable buffers the hash bucket lock is held across the
+ *	decrement of the hold count.
+ */
+void
+pagebuf_rele(
+	page_buf_t		*pb)
+{
+	pb_hash_t		*h;
+
+	PB_TRACE(pb, PB_TRACE_REC(rele), pb->pb_relse);
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+		h = pb_hash(pb);
+		spin_lock(&h->pb_hash_lock);
+	} else {
+		h = NULL;
+	}
+
+	if (atomic_dec_and_test(&pb->pb_hold)) {
+		int		do_free = 1;
+
+		if (pb->pb_relse) {
+			/*
+			 * Take the hold back and drop the hash lock before
+			 * calling the release routine.
+			 */
+			atomic_inc(&pb->pb_hold);
+			if (h)
+				spin_unlock(&h->pb_hash_lock);
+			(*(pb->pb_relse)) (pb);
+			do_free = 0;
+		}
+		if (pb->pb_flags & PBF_DELWRI) {
+			pb->pb_flags |= PBF_ASYNC;
+			atomic_inc(&pb->pb_hold);
+			/* do_free == 0 means the lock was already dropped above */
+			if (h && do_free)
+				spin_unlock(&h->pb_hash_lock);
+			pagebuf_delwri_queue(pb, 0);
+			do_free = 0;
+		} else if (pb->pb_flags & PBF_FS_MANAGED) {
+			/*
+			 * NOTE(review): this unlock is not guarded by do_free,
+			 * unlike the PBF_DELWRI branch; confirm that pb_relse
+			 * and PBF_FS_MANAGED are never both set.
+			 */
+			if (h)
+				spin_unlock(&h->pb_hash_lock);
+			do_free = 0;
+		}
+
+		if (do_free) {
+			/* h, when not NULL, is still locked at this point */
+			_pagebuf_free_object(h, pb);
+		}
+	} else if (h) {
+		spin_unlock(&h->pb_hash_lock);
+	}
+}
+
+
+/*
+ *	Pinning Buffer Storage in Memory
+ */
+
+/*
+ *	pagebuf_pin
+ *
+ *	pagebuf_pin locks all of the memory represented by a buffer in
+ *	memory.  Multiple calls to pagebuf_pin and pagebuf_unpin, for
+ *	the same or different buffers affecting a given page, will
+ *	properly count the number of outstanding "pin" requests.  The
+ *	buffer may be released after the pagebuf_pin and a different
+ *	buffer used when calling pagebuf_unpin, if desired.
+ *	pagebuf_pin should be used by the file system when it wants to be
+ *	assured that no attempt will be made to force the affected
+ *	memory to disk.	 It does not assure that a given logical page
+ *	will not be moved to a different physical page.
+ */
+void
+pagebuf_pin(
+	page_buf_t		*pb)
+{
+	/* one more outstanding pin on this buffer */
+	atomic_inc(&PBP(pb)->pb_pin_count);
+	PB_TRACE(pb, PB_TRACE_REC(pin), PBP(pb)->pb_pin_count.counter);
+}
+
+/*
+ *	pagebuf_unpin
+ *
+ *	pagebuf_unpin drops one pin taken by pagebuf_pin.  When the pin
+ *	count reaches zero every task sleeping on pb_waiters is woken.
+ */
+void
+pagebuf_unpin(
+	page_buf_t		*pb)
+{
+	if (atomic_dec_and_test(&PBP(pb)->pb_pin_count))
+		wake_up_all(&PBP(pb)->pb_waiters);
+
+	PB_TRACE(pb, PB_TRACE_REC(unpin), PBP(pb)->pb_pin_count.counter);
+}
+
+/*
+ *	pagebuf_ispin
+ *
+ *	Returns the current pin count of the buffer (zero when unpinned).
+ */
+int
+pagebuf_ispin(
+	page_buf_t		*pb)
+{
+	return atomic_read(&PBP(pb)->pb_pin_count);
+}
+
+/*
+ *	pagebuf_wait_unpin
+ *
+ *	pagebuf_wait_unpin waits until the pin count of the buffer has
+ *	dropped to zero.  It returns immediately if the buffer is not
+ *	pinned.  While waiting, the disk task queue is run before each
+ *	call to schedule().
+ */
+static inline void
+_pagebuf_wait_unpin(
+	page_buf_t		*pb)
+{
+	DECLARE_WAITQUEUE	(waiter, current);
+
+	if (atomic_read(&PBP(pb)->pb_pin_count) == 0)
+		return;
+
+	add_wait_queue(&PBP(pb)->pb_waiters, &waiter);
+	/* the task state is set before each test of the pin count */
+	current->state = TASK_UNINTERRUPTIBLE;
+	while (atomic_read(&PBP(pb)->pb_pin_count) != 0) {
+		run_task_queue(&tq_disk);
+		schedule();
+		current->state = TASK_UNINTERRUPTIBLE;
+	}
+	remove_wait_queue(&PBP(pb)->pb_waiters, &waiter);
+	current->state = TASK_RUNNING;
+}
+
+/*
+ *	pagebuf_queue_task
+ *
+ *	Queue a task on the iodone task queue of the current CPU and
+ *	wake whoever waits on that CPU's iodone wait queue.
+ */
+void
+pagebuf_queue_task(
+	struct tq_struct	*task)
+{
+	queue_task(task, &pagebuf_iodone_tq[smp_processor_id()]);
+	wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+}
+
+
+/*
+ *	Buffer Utility Routines
+ */
+
+/*
+ *	pagebuf_iodone
+ *
+ *	pagebuf_iodone marks a buffer for which I/O is in progress
+ *	done with respect to that I/O.	The pb_iodone routine, if
+ *	present, will be called as a side-effect.
+ */
+void
+pagebuf_iodone_sched(
+	void			*v)
+{
+	page_buf_t		*pb = (page_buf_t *)v;
+
+	/* a buffer with its own completion routine is handed to it as is */
+	if (pb->pb_iodone) {
+		(*(pb->pb_iodone)) (pb);
+		return;
+	}
+
+	if (!(pb->pb_flags & PBF_ASYNC))
+		return;
+
+	/* asynchronous I/O: unlock (unless a pb_relse routine is set) and release */
+	if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
+		pagebuf_unlock(pb);
+	pagebuf_rele(pb);
+}
+
+/*
+ * Clear the read/write flags (and, when there was no error, the
+ * PBF_PARTIAL and PBF_NONE flags), then either schedule
+ * pagebuf_iodone_sched on this CPU's iodone task queue (pb_iodone set
+ * or asynchronous I/O) or signal pb_iodonesema.
+ */
+void
+pagebuf_iodone(
+	page_buf_t		*pb)
+{
+	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
+	if (pb->pb_error == 0)
+		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
+
+	PB_TRACE(pb, PB_TRACE_REC(done), pb->pb_iodone);
+
+	if (!pb->pb_iodone && !(pb->pb_flags & PBF_ASYNC)) {
+		up(&pb->pb_iodonesema);
+		return;
+	}
+
+	INIT_TQUEUE(&pb->pb_iodone_sched,
+		pagebuf_iodone_sched, (void *)pb);
+
+	queue_task(&pb->pb_iodone_sched,
+			&pagebuf_iodone_tq[smp_processor_id()]);
+	wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+}
+
+/*
+ *	pagebuf_ioerror
+ *
+ *	pagebuf_ioerror stores the error code in the buffer; passing 0
+ *	clears it.
+ */
+void
+pagebuf_ioerror(			/* mark/clear buffer error flag */
+	page_buf_t		*pb,	/* buffer to mark		*/
+	unsigned int		error)	/* error to store (0 if none)	*/
+{
+	pb->pb_error = error;
+	PB_TRACE(pb, PB_TRACE_REC(ioerror), error);
+}
+
+/*
+ *	pagebuf_iostart
+ *
+ *	pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
+ *	If necessary, it will arrange for any disk space allocation required,
+ *	and it will break up the request if the block mappings require it.
+ *	A pb_iodone routine in the buffer supplied will only be called
+ *	when all of the subsidiary I/O requests, if any, have been completed.
+ *	pagebuf_iostart calls the pagebuf_ioinitiate routine or
+ *	pagebuf_iorequest, if the former routine is not defined, to start
+ *	the I/O on a given low-level request.
+ */
+int
+pagebuf_iostart(			/* start I/O on a buffer	  */
+	page_buf_t		*pb,	/* buffer to start		  */
+	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+					/* PBF_WRITE, PBF_ALLOCATE,	  */
+					/* PBF_DELWRI,			  */
+					/* PBF_SYNC, PBF_DONT_BLOCK	  */
+					/* PBF_RELEASE			  */
+{
+	int			status = 0;
+
+	PB_TRACE(pb, PB_TRACE_REC(iostart), flags);
+
+	/* delayed writes are only queued, no I/O is started here */
+	if (flags & PBF_DELWRI) {
+		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
+		pb->pb_flags |= flags &
+				(PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
+		pagebuf_delwri_queue(pb, 1);
+		return status;
+	}
+
+	pb->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI|PBF_READ_AHEAD);
+	pb->pb_flags |= flags & (PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_READ_AHEAD);
+
+	if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
+		BUG();
+	}
+
+	/* For writes call internal function which checks for
+	 * filesystem specific callout function and execute it.
+	 */
+	if (flags & PBF_WRITE) {
+		status = __pagebuf_iorequest(pb);
+	} else {
+		status = pagebuf_iorequest(pb);
+	}
+
+	/* Wait for I/O if we are not an async request */
+	if ((status == 0) && (flags & PBF_ASYNC) == 0) {
+		status = pagebuf_iowait(pb);
+	}
+
+	return status;
+}
+
+
+/*
+ * Helper routines for pagebuf_iorequest (pagebuf I/O completion)
+ *
+ * (different routines for locked/unlocked, and single/multi-bh pagebufs)
+ */
+
+/*
+ * Drop one count from pb_io_remaining; the caller that takes it to
+ * zero clears pb_locked and runs pagebuf_iodone.
+ */
+STATIC inline void
+_pb_io_done(
+	page_buf_t		*pb)
+{
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
+		pb->pb_locked = 0;
+		pagebuf_iodone(pb);
+	}
+}
+
+/*
+ * Completion handler for a page covered by a single private
+ * buffer_head: records EIO in the pagebuf on failure, frees the
+ * buffer_head, marks the page up to date, unlocks the page when
+ * `locked' is set and accounts the completion with _pb_io_done.
+ */
+STATIC void
+_end_pagebuf_page_io(
+	struct buffer_head	*bh,
+	int			uptodate,
+	int			locked)
+{
+	struct page		*page;
+	page_buf_t		*pb = (page_buf_t *) bh->b_private;
+
+	mark_buffer_uptodate(bh, uptodate);
+	atomic_dec(&bh->b_count);
+
+	page = bh->b_page;
+	if (!test_bit(BH_Uptodate, &bh->b_state)) {
+		set_bit(PG_error, &page->flags);
+		pb->pb_error = EIO;
+	}
+
+	unlock_buffer(bh);
+	_pagebuf_free_bh(bh);
+
+	SetPageUptodate(page);
+	if (locked)
+		unlock_page(page);
+	_pb_io_done(pb);
+}
+
+/* b_end_io entry point: single buffer_head, page is locked */
+STATIC void
+_end_io_locked(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io(bh, uptodate, 1);
+}
+
+/* b_end_io entry point: single buffer_head, page is not locked */
+STATIC void
+_end_io_nolock(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io(bh, uptodate, 0);
+}
+
+/* Shared completion state for a page whose I/O uses several buffer_heads */
+typedef struct {
+	page_buf_t	*pb;		/* pointer to pagebuf page is within */
+	int		locking;	/* are pages locked?		     */
+	atomic_t	remain;		/* count of remaining I/O requests   */
+} pagesync_t;
+
+/*
+ * Completion handler for one buffer_head of a multi buffer_head page.
+ * The handler that takes psync->remain to zero finishes the page,
+ * frees the pagesync_t and calls _pb_io_done.  `fullpage' is set for
+ * private buffer_heads, which are freed here.
+ */
+STATIC void
+_end_pagebuf_page_io_multi(
+	struct buffer_head	*bh,
+	int			uptodate,
+	int			fullpage)
+{
+	pagesync_t		*psync = (pagesync_t *) bh->b_private;
+	page_buf_t		*pb = psync->pb;
+	struct page		*page;
+
+	mark_buffer_uptodate(bh, uptodate);
+	put_bh(bh);
+
+	page = bh->b_page;
+	if (!test_bit(BH_Uptodate, &bh->b_state)) {
+		set_bit(PG_error, &page->flags);
+		pb->pb_error = EIO;
+	}
+
+	unlock_buffer(bh);
+	if (fullpage)
+		_pagebuf_free_bh(bh);
+
+	if (atomic_dec_and_test(&psync->remain) == 1) {
+		if (fullpage)
+			SetPageUptodate(page);
+		if (psync->locking)
+			unlock_page(page);
+		kfree(psync);
+		_pb_io_done(pb);
+	}
+}
+
+/* b_end_io entry point: several private buffer_heads per page */
+STATIC void
+_end_io_multi_full(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io_multi(bh, uptodate, 1);
+}
+
+/* b_end_io entry point: buffer_heads that belong to the page itself */
+STATIC void
+_end_io_multi_part(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io_multi(bh, uptodate, 0);
+}
+
+
+/*
+ * Initiate I/O on part of a page we are interested in
+ *
+ * When the block size is smaller than a page and the pagebuf does not
+ * use private buffer_heads, the buffer_heads attached to the page are
+ * used (created on demand).  Otherwise private buffer_heads are
+ * allocated, one per request.  Returns 0, or -ENOMEM when no private
+ * buffer_head could be obtained.
+ */
+STATIC int
+_pagebuf_page_io(
+	struct page		*page,	/* Page structure we are dealing with */
+	page_buf_t		*pb,	/* pagebuf holding it; dereferenced   */
+					/* unconditionally, must not be NULL  */
+	page_buf_daddr_t	bn,	/* starting block number	      */
+	kdev_t			dev,	/* device for I/O		      */
+	size_t			blocksize, /* filesystem block size	      */
+	off_t			pg_offset, /* starting offset in page	      */
+	size_t			pg_length, /* count of data to process	      */
+	int			locking, /* page locking in use		      */
+	int			rw,	/* read/write operation		      */
+	int			flush)
+{
+	size_t			sector;
+	size_t			blk_length = 0;
+	struct buffer_head	*bh, *head, *bufferlist[MAX_BUF_PER_PAGE];
+	int			multi_ok;
+	int			i = 0, cnt = 0, err = 0;
+	int			public_bh = 0;
+
+	if ((blocksize < PAGE_CACHE_SIZE) &&
+	    !(pb->pb_flags & _PBF_PRIVATE_BH)) {
+		int		cache_ok;
+
+		/* up to date buffers may be skipped unless forced or writing */
+		cache_ok = !((pb->pb_flags & PBF_FORCEIO) || (rw == WRITE));
+		public_bh = multi_ok = 1;
+
+		if (!page_has_buffers(page)) {
+			if (!locking) {
+				lock_page(page);
+				if (!page_has_buffers(page)) {
+					create_empty_buffers(page, dev,
+							     SECTOR_SIZE);
+				}
+				unlock_page(page);
+			} else {
+				create_empty_buffers(page, dev, SECTOR_SIZE);
+			}
+		}
+
+		/* Find buffer_heads belonging to just this pagebuf */
+		bh = head = page_buffers(page);
+		do {
+			if (buffer_uptodate(bh) && cache_ok)
+				continue;
+			blk_length = i << SECTOR_SHIFT;
+			if (blk_length < pg_offset)
+				continue;
+			if (blk_length >= pg_offset + pg_length)
+				break;
+
+			lock_buffer(bh);
+			get_bh(bh);
+			assert(!waitqueue_active(&bh->b_wait));
+
+			bh->b_size = SECTOR_SIZE;
+			bh->b_blocknr = bn + (i - (pg_offset >> SECTOR_SHIFT));
+			bufferlist[cnt++] = bh;
+		} while (i++, (bh = bh->b_this_page) != head);
+
+		goto request;
+	}
+
+	/* Calculate the block offsets and length we will be using */
+	if (pg_offset) {
+		size_t		block_offset;
+
+		block_offset = pg_offset >> SECTOR_SHIFT;
+		block_offset = pg_offset - (block_offset << SECTOR_SHIFT);
+		blk_length = (pg_length + block_offset + SECTOR_MASK) >>
+								SECTOR_SHIFT;
+	} else {
+		blk_length = (pg_length + SECTOR_MASK) >> SECTOR_SHIFT;
+	}
+
+	/* This will attempt to make a request bigger than the sector
+	 * size if we are well aligned.
+	 */
+	switch (pb->pb_target->pbr_flags) {
+	case 0:
+		sector = blk_length << SECTOR_SHIFT;
+		blk_length = 1;
+		break;
+	case PBR_ALIGNED_ONLY:
+		if ((pg_offset == 0) && (pg_length == PAGE_CACHE_SIZE) &&
+		    (((unsigned int) bn) & BN_ALIGN_MASK) == 0) {
+			sector = blk_length << SECTOR_SHIFT;
+			blk_length = 1;
+			break;
+		}
+	case PBR_SECTOR_ONLY:
+		/* Fallthrough, same as default */
+	default:
+		sector = SECTOR_SIZE;
+	}
+
+	/* The b_size field of struct buffer_head is an unsigned short
+	 * ... we may need to split this request up.  [64K is too big]
+	 */
+	assert(sizeof(bh->b_size) == 2);
+	while (sector > 0xffff) {
+		sector >>= 1;
+		blk_length++;
+	}
+
+	multi_ok = (blk_length != 1);
+
+	for (; blk_length > 0; blk_length--, pg_offset += sector) {
+		bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
+		if (!bh) {
+			bh = _pagebuf_get_prealloc_bh();
+			if (!bh) {
+				/* This should never happen */
+				err = -ENOMEM;
+				goto error;
+			}
+		}
+		memset(bh, 0, sizeof(*bh));
+		bh->b_size = sector;
+		bh->b_blocknr = bn++;
+		bh->b_dev = dev;
+		set_bit(BH_Lock, &bh->b_state);
+		set_bh_page(bh, page, pg_offset);
+		init_waitqueue_head(&bh->b_wait);
+		atomic_set(&bh->b_count, 1);
+		bufferlist[cnt++] = bh;
+	}
+
+request:
+	if (cnt) {
+		pagesync_t	*psync = NULL;
+		void		(*callback)(struct buffer_head *, int);
+
+		if (multi_ok) {
+			size_t	size = sizeof(pagesync_t);
+
+			psync = (pagesync_t *) kmalloc(size, GFP_NOFS);
+			if (!psync)
+				BUG();	/* Ugh - out of memory condition here */
+			psync->pb = pb;
+			psync->locking = locking;
+			/*
+			 * Account for every buffer_head before the first one
+			 * is submitted.  Counting them one at a time while
+			 * submitting lets a request that completes at once
+			 * take the count to zero and free psync while later
+			 * buffer_heads still point at it.
+			 */
+			atomic_set(&psync->remain, cnt);
+
+			callback = public_bh ?
+				   _end_io_multi_part : _end_io_multi_full;
+		} else {
+			callback = locking ? _end_io_locked : _end_io_nolock;
+		}
+
+		/* Indicate that there is another page in progress */
+		atomic_inc(&PBP(pb)->pb_io_remaining);
+
+#ifdef RQ_WRITE_ORDERED
+		if (flush)
+			set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state);
+#endif
+
+		for (i = 0; i < cnt; i++) {
+			bh = bufferlist[i];
+
+			/* Complete the buffer_head, then submit the IO */
+			if (psync) {
+				init_buffer(bh, callback, psync);
+			} else {
+				init_buffer(bh, callback, pb);
+			}
+
+			bh->b_rdev = bh->b_dev;
+			bh->b_rsector = bh->b_blocknr;
+			set_bit(BH_Mapped, &bh->b_state);
+			set_bit(BH_Req, &bh->b_state);
+
+			if (rw == WRITE) {
+				set_bit(BH_Uptodate, &bh->b_state);
+			}
+			generic_make_request(rw, bh);
+		}
+	} else {
+		if (locking)
+			unlock_page(page);
+	}
+
+	return err;
+error:
+	/*
+	 * Only the private buffer_head loop jumps here.  None of the
+	 * buffer_heads gathered so far has been through init_buffer(), so
+	 * b_end_io is still NULL and must not be called; give them straight
+	 * back instead.  Nothing was submitted, so no completion handler
+	 * will unlock the page: do it here, as the no-I/O path above does.
+	 * NOTE(review): _page_buf_page_apply ignores this return value, so
+	 * the failure is not recorded in the pagebuf.
+	 */
+	for (i = 0; i < cnt; i++)
+		_pagebuf_free_bh(bufferlist[i]);
+	if (locking)
+		unlock_page(page);
+	return err;
+}
+
+/*
+ * Start the I/O for one page segment of a pagebuf.  The block number is
+ * derived from pb_bn and the segment's offset within the buffer; reads
+ * and writes are both passed to _pagebuf_page_io.  Always returns
+ * the segment length it was given.
+ */
+STATIC int
+_page_buf_page_apply(
+	page_buf_t		*pb,
+	loff_t			offset,
+	struct page		*page,
+	size_t			pg_offset,
+	size_t			pg_length,
+	int			last)
+{
+	page_buf_daddr_t	bn = pb->pb_bn;
+	kdev_t			dev = pb->pb_target->pbr_kdev;
+	size_t			blocksize = pb->pb_target->pbr_blocksize;
+	loff_t			pb_offset;
+	size_t			ret_len = pg_length;
+
+	assert(page);
+
+	if ((blocksize == PAGE_CACHE_SIZE) &&
+	    (pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
+	    (pb->pb_flags & PBF_READ) && pb->pb_locked) {
+		/* sub-page read with page-sized blocks: read the whole page */
+		bn -= (pb->pb_offset >> SECTOR_SHIFT);
+		pg_offset = 0;
+		pg_length = PAGE_CACHE_SIZE;
+	} else {
+		pb_offset = offset - pb->pb_file_offset;
+		if (pb_offset) {
+			bn += (pb_offset + SECTOR_MASK) >> SECTOR_SHIFT;
+		}
+	}
+
+	if (pb->pb_flags & PBF_READ) {
+		_pagebuf_page_io(page, pb, bn, dev, blocksize,
+			(off_t)pg_offset, pg_length, pb->pb_locked, READ, 0);
+	} else if (pb->pb_flags & PBF_WRITE) {
+		int locking = (pb->pb_flags & _PBF_LOCKABLE) == 0;
+
+		/* Check we need to lock pages */
+		if (locking && (pb->pb_locked == 0))
+			lock_page(page);
+		_pagebuf_page_io(page, pb, bn, dev, blocksize,
+			(off_t)pg_offset, pg_length, locking, WRITE,
+			last && (pb->pb_flags & PBF_FLUSH));
+	}
+
+	return ret_len;
+}
+
+/*
+ *	pagebuf_iorequest
+ *
+ *	pagebuf_iorequest is the core I/O request routine.
+ *	It assumes that the buffer is well-formed and
+ *	mapped and ready for physical I/O, unlike
+ *	pagebuf_iostart() and pagebuf_iophysio().  Those
+ *	routines call the pagebuf_ioinitiate routine to start I/O,
+ *	if it is present, or else call pagebuf_iorequest()
+ *	directly if the pagebuf_ioinitiate routine is not present.
+ *
+ *	This function will be responsible for ensuring access to the
+ *	pages is restricted whilst I/O is in progress - for locking
+ *	pagebufs the pagebuf lock is the mediator, for non-locking
+ *	pagebufs the pages will be locked. In the locking case we
+ *	need to use the pagebuf lock as multiple meta-data buffers
+ *	will reference the same page.
+ */
+int
+pagebuf_iorequest(			/* start real I/O		*/
+	page_buf_t		*pb)	/* buffer to convey to device	*/
+{
+	int			status = 0;
+
+	PB_TRACE(pb, PB_TRACE_REC(ioreq), 0);
+
+	if (pb->pb_flags & PBF_DELWRI) {
+		pagebuf_delwri_queue(pb, 1);
+		return status;
+	}
+
+	if (pb->pb_flags & PBF_WRITE) {
+		_pagebuf_wait_unpin(pb);
+	}
+
+	/* Set the count to 1 initially, this will stop an I/O
+	 * completion callout which happens before we have started
+	 * all the I/O from calling iodone too early
+	 */
+	atomic_set(&PBP(pb)->pb_io_remaining, 1);
+	status = _pagebuf_segment_apply(pb);
+
+	/* Drop our count and if everything worked we are done */
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
+		pagebuf_iodone(pb);
+	} else if ((pb->pb_flags & (PBF_SYNC|PBF_ASYNC)) == PBF_SYNC)  {
+		run_task_queue(&tq_disk);
+	}
+
+	return status < 0 ? status : 0;
+}
+
+/*
+ *	pagebuf_iowait
+ *
+ *	pagebuf_iowait waits for I/O to complete on the buffer supplied.
+ *	It returns immediately if no I/O is pending.
+ *	In any case, it returns
+ *	the error code, if any, or 0 if there is no error.
+ */
+int
+pagebuf_iowait(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, PB_TRACE_REC(iowait), 0);
+
+	/* run the disk task queue, then sleep until the I/O is signalled done */
+	run_task_queue(&tq_disk);
+	down(&pb->pb_iodonesema);
+
+	PB_TRACE(pb, PB_TRACE_REC(iowaited), (int)pb->pb_error);
+	return pb->pb_error;
+}
+
+/*
+ * Forget the mapping of a buffer.  Returns the start of the address
+ * range when it was allocated for the buffer (_PBF_ADDR_ALLOCATED),
+ * NULL otherwise.
+ *
+ * Caller must free the address space, we are under a spin lock,
+ * probably not safe to do vfree here.
+ */
+STATIC void *
+pagebuf_mapout_locked(
+	page_buf_t		*pb)
+{
+	void			*to_free = NULL;
+
+	if (!(pb->pb_flags & PBF_MAPPED))
+		return to_free;
+
+	if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
+		to_free = pb->pb_addr - pb->pb_offset;
+	pb->pb_addr = NULL;
+	pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
+
+	return to_free;
+}
+
+/*
+ * Return the kernel address of the byte at `offset' within the buffer,
+ * found through the page array and page_address().
+ */
+caddr_t
+pagebuf_offset(
+	page_buf_t		*pb,
+	off_t			offset)
+{
+	struct page		*page;
+
+	/* make the offset relative to the start of the first page */
+	offset += pb->pb_offset;
+	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
+
+	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
+}
+
+/*
+ *	pagebuf_segment
+ *
+ *	pagebuf_segment is used to retrieve the various contiguous
+ *	segments of a buffer.  The variable addressed by the
+ *	loff_t * should be initialized to 0, and successive
+ *	calls will update to point to the segment following the one
+ *	returned.
+ */
+STATIC void
+pagebuf_segment(
+	page_buf_t		*pb,	/* buffer to examine		*/
+	loff_t			*boff_p,/* offset in buffer of next	*/
+					/* next segment (updated)	*/
+	struct page		**spage_p, /* page (updated)		*/
+					/* (NULL if not in page array)	*/
+	size_t			*soff_p,/* offset in page (updated)	*/
+	size_t			*ssize_p) /* segment length (updated)	*/
+{
+	loff_t			cur = *boff_p;	/* offset in pagebuf	*/
+	int			idx;		/* page index in pagebuf */
+	size_t			seg;		/* segment length	*/
+
+	idx = page_buf_btoct(cur + pb->pb_offset);
+	*spage_p = pb->pb_pages[idx];
+
+	*soff_p = page_buf_poff(cur + pb->pb_offset);
+
+	/* a segment ends at the page end or at the end of the wanted data */
+	seg = PAGE_CACHE_SIZE - *soff_p;
+	if (seg > (pb->pb_count_desired - cur))
+		seg = (pb->pb_count_desired - cur);
+	*ssize_p = seg;
+
+	*boff_p = cur + seg;
+}
+
+/*
+ *	pagebuf_iomove
+ *
+ *	Move data into or out of a buffer, one page segment at a time.
+ *	PBRW_ZERO clears the range, PBRW_READ copies from the buffer to
+ *	`data' and PBRW_WRITE copies from `data' into the buffer.
+ */
+void
+pagebuf_iomove(
+	page_buf_t		*pb,	/* buffer to process		*/
+	off_t			boff,	/* starting buffer offset	*/
+	size_t			bsize,	/* length to copy		*/
+	caddr_t			data,	/* data address			*/
+	page_buf_rw_t		mode)	/* read/write flag		*/
+{
+	loff_t			pos = boff;
+	size_t			in_page;
+	size_t			chunk;
+	struct page		*page;
+
+	boff += bsize; /* last */
+
+	while (pos < boff) {
+		pagebuf_segment(pb, &pos, &page, &in_page, &chunk);
+		assert(((chunk + in_page) <= PAGE_CACHE_SIZE));
+
+		switch (mode) {
+		case PBRW_ZERO:
+			memset(page_address(page) + in_page, 0, chunk);
+			break;
+		case PBRW_READ:
+			memcpy(data, page_address(page) + in_page, chunk);
+			break;
+		case PBRW_WRITE:
+			memcpy(page_address(page) + in_page, data, chunk);
+			break;
+		default:
+			break;
+		}
+
+		/* data advances for every mode */
+		data += chunk;
+	}
+}
+
+/*
+ *	_pagebuf_segment_apply
+ *
+ *	Applies _page_buf_page_apply to each segment of the page_buf_t.
+ *
+ *	Returns the total number of bytes handed to _page_buf_page_apply,
+ *	or the value of the first call that returned zero or less.  The
+ *	buffer is held for the duration of the walk.
+ */
+STATIC int
+_pagebuf_segment_apply(			/* apply function to segments	*/
+	page_buf_t		*pb)	/* buffer to examine		*/
+{
+	int			buf_index, sval, status = 0;
+	loff_t			buffer_offset = pb->pb_file_offset;
+	size_t			buffer_len = pb->pb_count_desired;
+	size_t			page_offset, len, total = 0;
+	size_t			cur_offset, cur_len;
+
+	pagebuf_hold(pb);
+
+	cur_offset = pb->pb_offset;
+	cur_len = buffer_len;
+
+	for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+		if (cur_len == 0)
+			break;
+		/* skip pages that lie wholly before the start offset */
+		if (cur_offset >= PAGE_CACHE_SIZE) {
+			cur_offset -= PAGE_CACHE_SIZE;
+			continue;
+		}
+
+		page_offset = cur_offset;
+		cur_offset = 0;
+
+		len = PAGE_CACHE_SIZE - page_offset;
+		if (len > cur_len)
+			len = cur_len;
+		cur_len -= len;
+
+		sval = _page_buf_page_apply(pb, buffer_offset,
+				pb->pb_pages[buf_index], page_offset, len,
+				buf_index+1 == pb->pb_page_count);
+		if (sval <= 0) {
+			status = sval;
+			break;
+		} else {
+			len = sval;
+			total += len;
+		}
+
+		buffer_offset += len;
+		buffer_len -= len;
+	}
+
+	pagebuf_rele(pb);
+
+	if (!status)
+		status = total;
+
+	return (status);
+}
+
+
+/*
+ * Pagebuf delayed write buffer handling
+ */
+
+/*
+ * Put a buffer at the tail of the delayed write queue and restart its
+ * flush timer.  A buffer that is already queued is moved to the tail;
+ * in that case, when `unlock' is set, one hold is dropped.  When
+ * `unlock' is set a lockable buffer is unlocked before returning.
+ */
+void
+pagebuf_delwri_queue(
+	page_buf_t		*pb,
+	int			unlock)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_q), unlock);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	/* If already in the queue, dequeue and place at tail */
+	if (!list_empty(&pb->pb_list)) {
+		if (unlock) {
+			atomic_dec(&pb->pb_hold);
+		}
+		list_del(&pb->pb_list);
+	} else {
+		pb_daemon->pb_delwri_cnt++;
+	}
+	list_add_tail(&pb->pb_list, &pb_daemon->pb_delwrite_l);
+	PBP(pb)->pb_flushtime = jiffies + pb_params.p_un.age_buffer;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
+		pagebuf_unlock(pb);
+	}
+}
+
+/*
+ * Take a buffer off the delayed write queue and clear PBF_DELWRI.
+ */
+void
+pagebuf_delwri_dequeue(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_uq), 0);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	list_del_init(&pb->pb_list);
+	pb->pb_flags &= ~PBF_DELWRI;
+	pb_daemon->pb_delwri_cnt--;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+}
+
+
+/*
+ * The pagebuf iodone daemon
+ */
+
+/*
+ * Per-CPU daemon state: set to 1 by the daemon once it is running,
+ * to 0 by pagebuf_daemon_stop to request exit, and to -1 by the
+ * daemon just before it returns.
+ */
+STATIC int pb_daemons[NR_CPUS];
+
+/*
+ * Per-CPU kernel thread which runs the iodone task queue of its CPU
+ * until pb_daemons[cpu] is cleared.
+ */
+STATIC int
+pagebuf_iodone_daemon(
+	void			*__bind_cpu)
+{
+	int			bind_cpu = (int) (long) __bind_cpu;
+	int			cpu = cpu_logical_map(bind_cpu);
+	DECLARE_WAITQUEUE	(wait, current);
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
+
+	/* Migrate to the right CPU */
+	current->cpus_allowed = 1UL << cpu;
+	while (smp_processor_id() != cpu)
+		schedule();
+
+	sprintf(current->comm, "pagebuf_io_CPU%d", bind_cpu);
+	INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]);
+	init_waitqueue_head(&pagebuf_iodone_wait[cpu]);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	mb();
+
+	/* tell pagebuf_daemon_start that this CPU's daemon is up */
+	pb_daemons[cpu] = 1;
+
+	for (;;) {
+		add_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+
+		/* do not sleep when work is already queued */
+		if (TQ_ACTIVE(pagebuf_iodone_tq[cpu]))
+			__set_task_state(current, TASK_RUNNING);
+		schedule();
+		remove_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+		run_task_queue(&pagebuf_iodone_tq[cpu]);
+		if (pb_daemons[cpu] == 0)
+			break;
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+	pb_daemons[cpu] = -1;
+	wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
+	return 0;
+}
+
+/* Defines for pagebuf daemon */
+DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
+STATIC int force_flush;
+
+/*
+ * Wake the pagebuf daemon.  `flag' is stored in force_flush; when it is
+ * non-zero the daemon ignores the flush time of queued buffers.
+ */
+STATIC void
+pagebuf_daemon_wakeup(
+	int			flag)
+{
+	force_flush = flag;
+	if (waitqueue_active(&pbd_waitq)) {
+		wake_up_interruptible(&pbd_waitq);
+	}
+}
+
+typedef void (*timeout_fn)(unsigned long);
+
+/*
+ * The delayed write daemon.  It sleeps until woken (by its timer or by
+ * pagebuf_daemon_wakeup), collects queued delayed write buffers that
+ * are unpinned and lockable without waiting, and writes them out.  The
+ * loop ends once pb_daemon->active is no longer 1; active is then set
+ * to -1 to tell pagebuf_daemon_stop that the thread has finished.
+ */
+STATIC int
+pagebuf_daemon(
+	void			*data)
+{
+	int			count;
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	struct timer_list	pb_daemon_timer =
+		{ {NULL, NULL}, 0, 0, (timeout_fn)pagebuf_daemon_wakeup };
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
+
+	strcpy(current->comm, "pagebufd");
+	current->flags |= PF_MEMALLOC;
+
+	INIT_LIST_HEAD(&tmp);
+	do {
+		if (pb_daemon->active == 1) {
+			del_timer(&pb_daemon_timer);
+			pb_daemon_timer.expires = jiffies +
+					pb_params.p_un.flush_interval;
+			add_timer(&pb_daemon_timer);
+			interruptible_sleep_on(&pbd_waitq);
+		}
+
+		if (pb_daemon->active == 0) {
+			del_timer(&pb_daemon_timer);
+		}
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+
+		count = 0;
+		list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+			pb = list_entry(curr, page_buf_t, pb_list);
+
+			PB_TRACE(pb, PB_TRACE_REC(walkq1), pagebuf_ispin(pb));
+
+			/* a zero return from pagebuf_cond_lock means locked */
+			if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
+			    (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
+			     !pagebuf_cond_lock(pb))) {
+
+				if (!force_flush && time_before(jiffies,
+						PBP(pb)->pb_flushtime)) {
+					pagebuf_unlock(pb);
+					break;
+				}
+
+				list_del(&pb->pb_list);
+				list_add(&pb->pb_list, &tmp);
+
+				count++;
+			}
+		}
+
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+		while (!list_empty(&tmp)) {
+			pb = list_entry(tmp.next,
+					page_buf_t, pb_list);
+			list_del_init(&pb->pb_list);
+			pb->pb_flags &= ~PBF_DELWRI;
+			pb->pb_flags |= PBF_WRITE;
+
+			__pagebuf_iorequest(pb);
+		}
+
+		if (count)
+			run_task_queue(&tq_disk);
+		if (as_list_len > 0)
+			purge_addresses();
+
+		force_flush = 0;
+	} while (pb_daemon->active == 1);
+
+	pb_daemon->active = -1;
+	wake_up_interruptible(&pbd_waitq);
+
+	return 0;
+}
+
+/*
+ * Write out the delayed write buffers of one target.
+ *
+ *	flags	PBDF_TRYLOCK: skip buffers that cannot be locked at once;
+ *		PBDF_WAIT: wait for the writes, then unlock (unless a
+ *		pb_relse routine is set) and release each buffer.
+ *	pinptr	when not NULL, receives the number of buffers skipped
+ *		because they were pinned or could not be locked.
+ */
+void
+pagebuf_delwri_flush(
+	pb_target_t		*target,
+	u_long			flags,
+	int			*pinptr)
+{
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	int			pincount = 0;
+
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	INIT_LIST_HEAD(&tmp);
+
+	list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+		pb = list_entry(curr, page_buf_t, pb_list);
+
+		/*
+		 * Skip other targets, markers and in progress buffers
+		 */
+
+		if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
+		    !(pb->pb_flags & PBF_DELWRI)) {
+			continue;
+		}
+
+		PB_TRACE(pb, PB_TRACE_REC(walkq2), pagebuf_ispin(pb));
+		if (pagebuf_ispin(pb)) {
+			pincount++;
+			continue;
+		}
+
+		if (flags & PBDF_TRYLOCK) {
+			/*
+			 * pagebuf_daemon treats a zero return from
+			 * pagebuf_cond_lock as "lock obtained", so a
+			 * non-zero return means the buffer is busy.  Skip
+			 * it in that case only; skipping on zero would
+			 * leave the buffer locked and would write buffers
+			 * this thread does not own.
+			 */
+			if (pagebuf_cond_lock(pb)) {
+				pincount++;
+				continue;
+			}
+		}
+
+		list_del_init(&pb->pb_list);
+		if (flags & PBDF_WAIT) {
+			list_add(&pb->pb_list, &tmp);
+			pb->pb_flags &= ~PBF_ASYNC;
+		}
+
+		/* the queue lock is dropped around locking and the I/O request */
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+		if ((flags & PBDF_TRYLOCK) == 0) {
+			pagebuf_lock(pb);
+		}
+
+		pb->pb_flags &= ~PBF_DELWRI;
+		pb->pb_flags |= PBF_WRITE;
+
+		__pagebuf_iorequest(pb);
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+	}
+
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	run_task_queue(&tq_disk);
+
+	if (pinptr)
+		*pinptr = pincount;
+
+	if ((flags & PBDF_WAIT) == 0)
+		return;
+
+	while (!list_empty(&tmp)) {
+		pb = list_entry(tmp.next, page_buf_t, pb_list);
+
+		list_del_init(&pb->pb_list);
+		pagebuf_iowait(pb);
+		if (!pb->pb_relse)
+			pagebuf_unlock(pb);
+		pagebuf_rele(pb);
+	}
+}
+
+/*
+ * Allocate the daemon state and start the delayed write daemon plus one
+ * iodone daemon per CPU, waiting for each iodone daemon to report that
+ * it is running.  Does nothing when the state already exists.  Returns
+ * 0, or -1 when the state cannot be allocated.
+ */
+STATIC int
+pagebuf_daemon_start(void)
+{
+	if (!pb_daemon) {
+		int		cpu;
+
+		pb_daemon = (pagebuf_daemon_t *)
+				kmalloc(sizeof(pagebuf_daemon_t), GFP_KERNEL);
+		if (!pb_daemon) {
+			return -1; /* error */
+		}
+
+		pb_daemon->active = 1;
+		pb_daemon->io_active = 1;
+		pb_daemon->pb_delwri_cnt = 0;
+		pb_daemon->pb_delwrite_lock = SPIN_LOCK_UNLOCKED;
+
+		INIT_LIST_HEAD(&pb_daemon->pb_delwrite_l);
+
+		kernel_thread(pagebuf_daemon, (void *)pb_daemon,
+				CLONE_FS|CLONE_FILES|CLONE_VM);
+		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+			if (kernel_thread(pagebuf_iodone_daemon,
+					(void *)(long) cpu,
+					CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
+				printk("pagebuf_daemon_start failed\n");
+			} else {
+				while (!pb_daemons[cpu_logical_map(cpu)]) {
+					current->policy |= SCHED_YIELD;
+					schedule();
+				}
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * pagebuf_daemon_stop
+ *
+ * Note: do not mark as __exit, it is called from pagebuf_terminate.
+ */
+STATIC void
+pagebuf_daemon_stop(void)
+{
+	if (pb_daemon) {
+		int		cpu;
+
+		pb_daemon->active = 0;
+		pb_daemon->io_active = 0;
+
+		/* pagebuf_daemon sets active to -1 once it has left its loop */
+		wake_up_interruptible(&pbd_waitq);
+		while (pb_daemon->active == 0) {
+			interruptible_sleep_on(&pbd_waitq);
+		}
+		/* same handshake with every per-CPU iodone daemon */
+		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+			pb_daemons[cpu_logical_map(cpu)] = 0;
+			wake_up(&pagebuf_iodone_wait[cpu_logical_map(cpu)]);
+			while (pb_daemons[cpu_logical_map(cpu)] != -1) {
+				interruptible_sleep_on(
+				&pagebuf_iodone_wait[cpu_logical_map(cpu)]);
+			}
+		}
+
+		kfree(pb_daemon);
+		pb_daemon = NULL;
+	}
+}
+
+
+/*
+ * Pagebuf sysctl interface
+ */
+
+/*
+ * sysctl handler for "stats_clear": after a successful write of a
+ * non-zero value the pagebuf statistics are zeroed and the control
+ * value is reset to 0.  Returns the result of proc_doulongvec_minmax.
+ */
+STATIC int
+pb_stats_clear_handler(
+	ctl_table		*ctl,
+	int			write,
+	struct file		*filp,
+	void			*buffer,
+	size_t			*lenp)
+{
+	int			ret;
+	/*
+	 * The table entry is sizeof(ulong) wide and is parsed by
+	 * proc_doulongvec_minmax, so it has to be read back as a ulong;
+	 * reading it through an int pointer looks at the wrong half of the
+	 * value on big-endian machines where long is wider than int.
+	 */
+	ulong			*valp = ctl->data;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+
+	if (!ret && write && *valp) {
+		printk("XFS Clearing pbstats\n");
+		memset(&pbstats, 0, sizeof(pbstats));
+		pb_params.p_un.stats_clear = 0;
+	}
+
+	return ret;
+}
+
+STATIC struct ctl_table_header *pagebuf_table_header;
+
+/* Tunables exported below the "pagebuf" sysctl directory */
+STATIC ctl_table pagebuf_table[] = {
+	{PB_FLUSH_INT, "flush_int", &pb_params.data[0],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[0], &pagebuf_max[0]},
+
+	{PB_FLUSH_AGE, "flush_age", &pb_params.data[1],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[1], &pagebuf_max[1]},
+
+	{PB_STATS_CLEAR, "stats_clear", &pb_params.data[3],
+	sizeof(ulong), 0644, NULL, &pb_stats_clear_handler,
+	&sysctl_intvec, NULL, &pagebuf_min[3], &pagebuf_max[3]},
+
+#ifdef PAGEBUF_TRACE
+	{PB_DEBUG, "debug", &pb_params.data[4],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[4], &pagebuf_max[4]},
+#endif
+	{0}
+};
+
+STATIC ctl_table pagebuf_dir_table[] = {
+	{VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
+	{0}
+};
+
+STATIC ctl_table pagebuf_root_table[] = {
+	{CTL_VM, "vm",	NULL,
0, 0555, pagebuf_dir_table}, + {0} +}; + +#ifdef CONFIG_PROC_FS +STATIC int +pagebuf_readstats( + char *buffer, + char **start, + off_t offset, + int count, + int *eof, + void *data) +{ + int i, len; + + len = 0; + len += sprintf(buffer + len, "pagebuf"); + for (i = 0; i < sizeof(pbstats) / sizeof(u_int32_t); i++) { + len += sprintf(buffer + len, " %u", + *(((u_int32_t*)&pbstats) + i)); + } + buffer[len++] = '\n'; + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} +#endif /* CONFIG_PROC_FS */ + +STATIC void +pagebuf_shaker(void) +{ + pagebuf_daemon_wakeup(1); +} + + +/* + * Initialization and Termination + */ + +int __init +pagebuf_init(void) +{ + int i; + + pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1); + +#ifdef CONFIG_PROC_FS + if (proc_mkdir("fs/pagebuf", 0)) + create_proc_read_entry( + "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL); +#endif + + pagebuf_cache = kmem_cache_create("page_buf_t", + sizeof(page_buf_private_t), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (pagebuf_cache == NULL) { + printk("pagebuf: couldn't init pagebuf cache\n"); + pagebuf_terminate(); + return -ENOMEM; + } + + if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH) { + printk("pagebuf: couldn't pre-allocate %d buffer heads\n", + NR_RESERVED_BH); + pagebuf_terminate(); + return -ENOMEM; + } + + init_waitqueue_head(&pb_resv_bh_wait); + + for (i = 0; i < NHASH; i++) { + spin_lock_init(&pbhash[i].pb_hash_lock); + INIT_LIST_HEAD(&pbhash[i].pb_hash); + } + +#ifdef PAGEBUF_TRACE +# if 1 + pb_trace.buf = (pagebuf_trace_t *)kmalloc( + PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t), GFP_KERNEL); +# else + /* Alternatively, for really really long trace bufs */ + pb_trace.buf = (pagebuf_trace_t *)vmalloc( + PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t)); +# endif + memset(pb_trace.buf, 0, PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t)); + pb_trace.start 
= 0; + pb_trace.end = PB_TRACE_BUFSIZE - 1; +#endif + + pagebuf_daemon_start(); + kmem_shake_register(pagebuf_shaker); + return 0; +} + +/* + * pagebuf_terminate. + * + * Note: do not mark as __exit, this is also called from the __init code. + */ +void +pagebuf_terminate(void) +{ + pagebuf_daemon_stop(); + + kmem_cache_destroy(pagebuf_cache); + kmem_shake_deregister(pagebuf_shaker); + + unregister_sysctl_table(pagebuf_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("fs/pagebuf/stat", NULL); + remove_proc_entry("fs/pagebuf", NULL); +#endif +} + + +/* + * Module management (for kernel debugger module) + */ +EXPORT_SYMBOL(pagebuf_offset); diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/page_buf.h linux-2.4-xfs/fs/xfs/pagebuf/page_buf.h --- linux-2.4.19/fs/xfs/pagebuf/page_buf.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/page_buf.h Wed Sep 4 22:35:19 2002 @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI + */ + +#ifndef __PAGE_BUF_H__ +#define __PAGE_BUF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Turn this on to get pagebuf lock ownership +#define PAGEBUF_LOCK_TRACKING +*/ + +/* + * Base types + */ + +/* daddr must be signed since -1 is used for bmaps that are not yet allocated */ +typedef loff_t page_buf_daddr_t; + +#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL)) + +typedef size_t page_buf_dsize_t; /* size of buffer in blocks */ + +/* + * Byte/page conversions in units of PAGE_CACHE_SIZE: ctob is pages to + * bytes, btoc is bytes to pages rounded up, btoct is bytes to pages + * truncated, and poff is the byte offset within a page. + */ +#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) +#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum page_buf_rw_e { + PBRW_READ = 1, /* transfer into target memory */ + PBRW_WRITE = 2, /* transfer from target memory */ + PBRW_ZERO = 3 /* Zero target memory */ +} page_buf_rw_t; + +typedef enum { /* pbm_flags values */ + PBMF_EOF = 0x01, /* mapping contains EOF */ + PBMF_HOLE = 0x02, /* mapping covers a hole */ + PBMF_DELAY = 0x04, /* mapping covers delalloc region */ + PBMF_UNWRITTEN = 0x20 /* mapping covers allocated */ + /* but uninitialized XFS data */ +} bmap_flags_t; + +typedef enum page_buf_flags_e { /* pb_flags values */ + PBF_READ = (1 << 0), /* buffer intended for reading from device */ + PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ + PBF_MAPPED = 
(1 << 2), /* buffer mapped (pb_addr valid) */ + PBF_PARTIAL = (1 << 3), /* buffer partially read */ + PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ + PBF_NONE = (1 << 5), /* buffer not read at all */ + PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ + PBF_FREED = (1 << 7), /* buffer has been freed and is invalid */ + PBF_SYNC = (1 << 8), /* force updates to disk */ + PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages */ + PBF_STALE = (1 << 10), /* buffer has been staled, do not find it */ + PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory */ + PBF_RELEASE = (1 << 12),/* buffer to be released after I/O is done */ + + /* flags used only as arguments to access routines */ + PBF_LOCK = (1 << 13), /* lock requested */ + PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait */ + PBF_ALLOCATE = (1 << 15), /* allocate all pages (UNUSED) */ + PBF_FILE_ALLOCATE = (1 << 16), /* allocate all file space */ + PBF_DONT_BLOCK = (1 << 17), /* do not block in current thread */ + PBF_DIRECT = (1 << 18), /* direct I/O desired */ + PBF_ENTER_PAGES = (1 << 21), /* create invalid pages for all */ + /* pages in the range of the buffer */ + /* not already associated with buffer */ + + /* flags used only internally */ + _PBF_LOCKABLE = (1 << 19), /* page_buf_t may be locked */ + _PBF_PRIVATE_BH = (1 << 20), /* do not use public buffer heads */ + _PBF_ALL_PAGES_MAPPED = (1 << 22), + /* all pages in range are mapped */ + _PBF_SOME_INVALID_PAGES = (1 << 23), + /* some mapped pages are not valid */ + _PBF_ADDR_ALLOCATED = (1 << 24), + /* pb_addr space was allocated */ + _PBF_MEM_ALLOCATED = (1 << 25), + /* pb_mem and underlying pages allocated */ + + PBF_FORCEIO = (1 << 27), + PBF_FLUSH = (1 << 28), /* flush disk write cache */ + PBF_READ_AHEAD = (1 << 29), + PBF_FS_RESERVED_3 = (1 << 31) /* reserved (XFS use: XFS_B_STALE) */ + +} page_buf_flags_t; + +#define PBF_UPDATE (PBF_READ | PBF_WRITE) +#define PBF_NOT_DONE(pb) (((pb)->pb_flags & 
(PBF_PARTIAL|PBF_NONE)) != 0) +#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0) + +#define PBR_SECTOR_ONLY 1 /* only use sector size buffer heads */ +#define PBR_ALIGNED_ONLY 2 /* only use aligned I/O */ + +typedef struct pb_target { + int pbr_flags; + dev_t pbr_dev; + kdev_t pbr_kdev; + struct block_device *pbr_bdev; + struct address_space *pbr_mapping; + unsigned int pbr_blocksize; + unsigned int pbr_blocksize_bits; +} pb_target_t; + +/* + * page_buf_bmap_t: File system I/O map + * + * The pbm_bn, pbm_offset and pbm_length fields are expressed in disk blocks. + * The pbm_length field specifies the size of the underlying backing store + * for the particular mapping. + * + * The pbm_bsize, pbm_size and pbm_delta fields are in bytes and indicate + * the size of the mapping, the number of bytes that are valid to access + * (read or write), and the offset into the mapping, given the offset + * supplied to the file I/O map routine. pbm_delta is the offset of the + * desired data from the beginning of the mapping. + * + * When a request is made to read beyond the logical end of the object, + * pbm_size may be set to 0, but pbm_offset and pbm_length should be set to + * the actual amount of underlying storage that has been allocated, if any. + * + * NOTE(review): this description appears to predate the structure below, + * which has no pbm_length or pbm_size member and documents pbm_offset as a + * byte offset; confirm the intended units and refresh this text. + */ + +typedef struct page_buf_bmap_s { + page_buf_daddr_t pbm_bn; /* block number in file system */ + pb_target_t *pbm_target; /* device to do I/O to */ + loff_t pbm_offset; /* byte offset of mapping in file */ + size_t pbm_delta; /* offset of request into bmap */ + size_t pbm_bsize; /* size of this mapping in bytes */ + bmap_flags_t pbm_flags; /* options flags for mapping */ +} page_buf_bmap_t; + +typedef page_buf_bmap_t pb_bmap_t; + + +/* + * page_buf_t: Buffer structure for page cache-based buffers + * + * This buffer structure is used by the page cache buffer management routines + * to refer to an assembly of pages forming a logical buffer. 
The actual + * I/O is performed with buffer_head or bio structures, as required by drivers, + * for drivers which do not understand this structure. The buffer structure is + * used on a temporary basis only, and discarded when released. + * + * The real data storage is recorded in the page cache. Metadata is + * hashed to the inode for the block device on which the file system resides. + * File data is hashed to the inode for the file. Pages which are only + * partially filled with data have bits set in their block_map entry + * to indicate which disk blocks in the page are not valid. + */ + +struct page_buf_s; +typedef void (*page_buf_iodone_t)(struct page_buf_s *); + /* call-back function on I/O completion */ +typedef void (*page_buf_relse_t)(struct page_buf_s *); + /* call-back function on buffer release */ +typedef int (*page_buf_bdstrat_t)(struct page_buf_s *); + /* pre-write function, see pb_strat */ + +#define PB_PAGES 4 /* number of page pointers held inline in page_buf_t */ + +typedef struct page_buf_s { + struct list_head pb_list; + page_buf_flags_t pb_flags; /* status flags */ + struct list_head pb_hash_list; + struct pb_target *pb_target; /* logical object */ + atomic_t pb_hold; /* reference count */ + page_buf_daddr_t pb_bn; /* block number for I/O */ + loff_t pb_file_offset; /* offset in file */ + size_t pb_buffer_length; /* size of buffer in bytes */ + size_t pb_count_desired; /* desired transfer size */ + void *pb_addr; /* virtual address of buffer */ + struct tq_struct pb_iodone_sched; + page_buf_iodone_t pb_iodone; /* I/O completion function */ + page_buf_relse_t pb_relse; /* releasing function */ + page_buf_bdstrat_t pb_strat; /* pre-write function */ + struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ + void *pb_fspriv; + void *pb_fspriv2; + void *pb_fspriv3; + unsigned short pb_error; /* error code on I/O */ + unsigned short pb_page_count; /* size of page array */ + unsigned short pb_offset; /* page offset in first page */ + unsigned char pb_locked; /* page array is locked */ + unsigned char pb_hash_index; /* hash table index 
*/ + struct page **pb_pages; /* array of page pointers */ + struct page *pb_page_array[PB_PAGES]; /* inline pages */ +} page_buf_t; + + +/* + * page_buf module entry points + */ + +/* Finding and Reading Buffers */ + +extern page_buf_t *pagebuf_find( /* find buffer for block if */ + /* the block is in memory */ + struct pb_target *, /* target for block */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK */ + +extern page_buf_t *pagebuf_get( /* allocate a buffer */ + struct pb_target *, /* target for buffer */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK, PBF_READ, PBF_ALLOCATE, */ + /* PBF_ASYNC, */ + +extern page_buf_t *pagebuf_lookup( + struct pb_target *, + struct inode *, + loff_t, /* starting offset of range */ + size_t, /* length of range */ + int); /* PBF_ENTER_PAGES */ + +extern page_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */ + /* no memory or disk address */ + struct pb_target *); /* mount point "fake" inode */ + +extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */ + /* without disk address */ + size_t len, + struct pb_target *); /* mount point "fake" inode */ + +extern int pagebuf_associate_memory( + page_buf_t *, + void *, + size_t); + + +extern void pagebuf_hold( /* increment reference count */ + page_buf_t *); /* buffer to hold */ + +extern void pagebuf_readahead( /* read ahead into cache */ + struct pb_target *, /* target for buffer (or NULL) */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + int); /* additional read flags */ + +/* Writing and Releasing Buffers */ + +extern void pagebuf_free( /* deallocate a buffer */ + page_buf_t *); /* buffer to deallocate */ + +extern void pagebuf_rele( /* release hold on a buffer */ + page_buf_t *); /* buffer to release */ + +/* Locking and Unlocking Buffers */ + +extern int pagebuf_cond_lock( /* lock buffer, if not locked */ + /* 
(returns -EBUSY if locked) */ + page_buf_t *); /* buffer to lock */ + +extern int pagebuf_lock_value( /* return count on lock */ + page_buf_t *); /* buffer to check */ + +extern int pagebuf_lock( /* lock buffer */ + page_buf_t *); /* buffer to lock */ + +extern void pagebuf_lock_disable( /* disable buffer locking */ + struct pb_target *, /* target for buffers */ + int); /* do blkdev_put? */ + +extern struct pb_target *pagebuf_lock_enable( + dev_t, + int); /* do blkdev_get? */ + +extern void pagebuf_target_blocksize( + pb_target_t *, + unsigned int); /* block size */ + +extern void pagebuf_target_clear(struct pb_target *); + +extern void pagebuf_unlock( /* unlock buffer */ + page_buf_t *); /* buffer to unlock */ + +/* Buffer Utility Routines */ + +/* read the error code stored on the buffer (pb_error) */ +#define pagebuf_geterror(pb) ((pb)->pb_error) + +extern void pagebuf_queue_task( + struct tq_struct *); + +extern void pagebuf_iodone( /* mark buffer I/O complete */ + page_buf_t *); /* buffer to mark */ + +extern void pagebuf_ioerror( /* mark buffer in error (or not) */ + page_buf_t *, /* buffer to mark */ + unsigned int); /* error to store (0 if none) */ + +extern int pagebuf_iostart( /* start I/O on a buffer */ + page_buf_t *, /* buffer to start */ + page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ + /* PBF_WRITE, PBF_ALLOCATE, */ + /* PBF_DELWRI, */ + /* PBF_SYNC */ + +extern int pagebuf_iorequest( /* start real I/O */ + page_buf_t *); /* buffer to convey to device */ + + /* + * pagebuf_iorequest is the core I/O request routine. + * It assumes that the buffer is well-formed and + * mapped and ready for physical I/O, unlike + * pagebuf_iostart() and pagebuf_iophysio(). Those + * routines call the inode pagebuf_ioinitiate routine to start I/O, + * if it is present, or else call pagebuf_iorequest() + * directly if the inode pagebuf_ioinitiate routine is not present. 
+ */ + +extern int pagebuf_iowait( /* wait for buffer I/O done */ + page_buf_t *); /* buffer to wait on */ + +extern caddr_t pagebuf_offset(page_buf_t *, off_t); + +extern void pagebuf_iomove( /* move data in/out of pagebuf */ + page_buf_t *, /* buffer to manipulate */ + off_t, /* starting buffer offset */ + size_t, /* length in buffer */ + caddr_t, /* data pointer */ + page_buf_rw_t); /* direction */ + +/* Pinning Buffer Storage in Memory */ + +extern void pagebuf_pin( /* pin buffer in memory */ + page_buf_t *); /* buffer to pin */ + +extern void pagebuf_unpin( /* unpin buffered data */ + page_buf_t *); /* buffer to unpin */ + +extern int pagebuf_ispin( page_buf_t *); /* check if pagebuf is pinned */ + +/* Reading and writing pages */ + +extern int pagebuf_write_full_page( /* write a page via pagebuf */ + struct page *, /* page to write */ + int delalloc); /* delalloc bh present */ + +extern int pagebuf_release_page( /* Attempt to convert a delalloc page */ + struct page *); /* page to release */ + +extern void pagebuf_delwri_queue(page_buf_t *, int); +extern void pagebuf_delwri_dequeue(page_buf_t *); + +#define PBDF_WAIT 0x01 +#define PBDF_TRYLOCK 0x02 +extern void pagebuf_delwri_flush( + struct pb_target *, + unsigned long, + int *); + +extern int pagebuf_init(void); +extern void pagebuf_terminate(void); + +/* + * Start I/O on pb: when the buffer has a pb_strat hook, call it and return + * its result; otherwise hand the buffer to pagebuf_iorequest(). + */ +static __inline__ int __pagebuf_iorequest(page_buf_t *pb) +{ + if (pb->pb_strat) + return pb->pb_strat(pb); + return pagebuf_iorequest(pb); +} + +#endif /* __PAGE_BUF_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/page_buf.lst linux-2.4-xfs/fs/xfs/pagebuf/page_buf.lst --- linux-2.4.19/fs/xfs/pagebuf/page_buf.lst Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/page_buf.lst Wed Aug 28 04:40:57 2002 @@ -0,0 +1,7580 @@ + +/repo/linux-2.4-xfs/fs/xfs/pagebuf/page_buf.o: file format elf32-i386 + +Disassembly of section .text: + +0000000000000000 <_bhash>: + 0: 56 push %esi + 1: 53 push %ebx + 2: 8b 4c 24 10 mov 0x10(%esp,1),%ecx + 6: 8b 5c 24 14 
mov 0x14(%esp,1),%ebx + a: 0f ac d9 09 shrd $0x9,%ebx,%ecx + e: c1 fb 09 sar $0x9,%ebx + 11: 0f b7 44 24 0c movzwl 0xc(%esp,1),%eax + 16: 31 d2 xor %edx,%edx + 18: 31 c1 xor %eax,%ecx + 1a: 31 d3 xor %edx,%ebx + 1c: 31 f6 xor %esi,%esi + 1e: 89 c8 mov %ecx,%eax + 20: 09 d8 or %ebx,%eax + 22: 74 1c je 40 <_bhash+0x40> + 24: 89 c8 mov %ecx,%eax + 26: 83 e0 1f and $0x1f,%eax + 29: 31 c6 xor %eax,%esi + 2b: 0f ac d9 05 shrd $0x5,%ebx,%ecx + 2f: c1 fb 05 sar $0x5,%ebx + 32: 83 c2 05 add $0x5,%edx + 35: 89 c8 mov %ecx,%eax + 37: 09 d8 or %ebx,%eax + 39: 74 05 je 40 <_bhash+0x40> + 3b: 83 fa 3f cmp $0x3f,%edx + 3e: 76 e4 jbe 24 <_bhash+0x24> + 40: 89 f0 mov %esi,%eax + 42: 5b pop %ebx + 43: 5e pop %esi + 44: c3 ret + 45: 8d 76 00 lea 0x0(%esi),%esi + +0000000000000048 : +{ +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 48: 81 3d 44 00 00 00 ad cmpl $0xdead4ead,0x44 + 4f: 4e ad de + 52: 74 1a je 6e +printk("eip: %p\n", &&here); + 54: 68 48 00 00 00 push $0x48 + 59: 68 2b 00 00 00 push $0x2b + 5e: e8 fc ff ff ff call 5f + BUG(); + 63: 0f 0b ud2a + 65: 85 00 test %eax,(%eax) + 67: 00 00 add %al,(%eax) + 69: 00 00 add %al,(%eax) + } + 6b: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 6e: f0 fe 0d 40 00 00 00 lock decb 0x40 + 75: 0f 88 4e 2d 00 00 js 2dc9 +{ + a_list_t *aentry; + + spin_lock(&as_lock); + aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC); + 7b: 6a 20 push $0x20 + 7d: 6a 08 push $0x8 + 7f: e8 fc ff ff ff call 80 + 84: 89 c2 mov %eax,%edx + aentry->next = as_free_head; + 86: a1 e0 05 00 00 mov 0x5e0,%eax + 8b: 89 42 04 mov %eax,0x4(%edx) + aentry->vm_addr = addr; + 8e: 8b 44 24 0c mov 0xc(%esp,1),%eax + 92: 89 02 mov %eax,(%edx) + as_free_head = aentry; + 94: 89 15 e0 05 00 00 mov %edx,0x5e0 + as_list_len++; + 9a: ff 05 e4 05 00 00 incl 0x5e4 + :"=q" (oldval), "=m" (lock->lock) \ + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + a0: 83 c4 08 add $0x8,%esp + char oldval = 1; + a3: b2 
01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + a5: 81 3d 44 00 00 00 ad cmpl $0xdead4ead,0x44 + ac: 4e ad de + af: 74 08 je b9 + BUG(); + b1: 0f 0b ud2a + b3: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + b9: a0 40 00 00 00 mov 0x40,%al + be: 84 c0 test %al,%al + c0: 7e 08 jle ca + BUG(); + c2: 0f 0b ud2a + c4: 6b 00 00 imul $0x0,(%eax),%eax + c7: 00 00 add %al,(%eax) + c9: 00 86 15 40 00 00 add %al,0x4015(%esi) +#endif + __asm__ __volatile__( + cf: 00 c3 add %al,%bl + spin_unlock(&as_lock); +} + d1: 8d 76 00 lea 0x0(%esi),%esi + +00000000000000d4 : + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) return; + d4: 56 push %esi + d5: 53 push %ebx + d6: 83 3d e0 05 00 00 00 cmpl $0x0,0x5e0 + dd: 0f 84 97 00 00 00 je 17a +{ +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + e3: 81 3d 44 00 00 00 ad cmpl $0xdead4ead,0x44 + ea: 4e ad de + ed: 74 1a je 109 +printk("eip: %p\n", &&here); + ef: 68 e3 00 00 00 push $0xe3 + f4: 68 2b 00 00 00 push $0x2b + f9: e8 fc ff ff ff call fa + BUG(); + fe: 0f 0b ud2a + 100: 85 00 test %eax,(%eax) + 102: 00 00 add %al,(%eax) + 104: 00 00 add %al,(%eax) + } + 106: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 109: f0 fe 0d 40 00 00 00 lock decb 0x40 + 110: 0f 88 c3 2c 00 00 js 2dd9 + + spin_lock(&as_lock); + aentry = as_free_head; + 116: 8b 1d e0 05 00 00 mov 0x5e0,%ebx + as_free_head = NULL; + 11c: c7 05 e0 05 00 00 00 movl $0x0,0x5e0 + 123: 00 00 00 + as_list_len = 0; + 126: c7 05 e4 05 00 00 00 movl $0x0,0x5e4 + 12d: 00 00 00 + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 130: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 132: 81 3d 44 00 00 00 ad cmpl $0xdead4ead,0x44 + 139: 4e ad de + 13c: 74 08 je 146 + BUG(); + 13e: 0f 0b ud2a + 140: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 
146: a0 40 00 00 00 mov 0x40,%al + 14b: 84 c0 test %al,%al + 14d: 7e 08 jle 157 + BUG(); + 14f: 0f 0b ud2a + 151: 6b 00 00 imul $0x0,(%eax),%eax + 154: 00 00 add %al,(%eax) + 156: 00 86 15 40 00 00 add %al,0x4015(%esi) +#endif + __asm__ __volatile__( + 15c: 00 eb add %ch,%bl + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + 15e: 15 90 8b 03 50 adc $0x50038b90,%eax + vfree(aentry->vm_addr); + 163: e8 fc ff ff ff call 164 + aentry = aentry->next; + 168: 8b 5b 04 mov 0x4(%ebx),%ebx + kfree(old); + 16b: 56 push %esi + 16c: e8 fc ff ff ff call 16d + } + 171: 83 c4 08 add $0x8,%esp + 174: 89 de mov %ebx,%esi + 176: 85 db test %ebx,%ebx + 178: 75 e6 jne 160 +} + 17a: 5b pop %ebx + 17b: 5e pop %esi + 17c: c3 ret + 17d: 8d 76 00 lea 0x0(%esi),%esi + +0000000000000180 <_pagebuf_initialize>: + +/* + * Locking model: + * + * Buffers associated with inodes for which buffer locking + * is not enabled are not protected by semaphores, and are + * assumed to be exclusively owned by the caller. There is + * spinlock in the buffer, for use by the caller when concurrent + * access is possible. + */ + +/* + * Internal pagebuf object manipulation + */ + +STATIC void +_pagebuf_initialize( + page_buf_t *pb, + pb_target_t *target, + loff_t range_base, + size_t range_length, + page_buf_flags_t flags) +{ + 180: 55 push %ebp + 181: 57 push %edi + 182: 56 push %esi + 183: 53 push %ebx + 184: 8b 74 24 24 mov 0x24(%esp,1),%esi + /* + * We don't want certain flags to appear in pb->pb_flags. + */ + flags &= ~(PBF_LOCK|PBF_ENTER_PAGES|PBF_MAPPED); + flags &= ~(PBF_DONT_BLOCK|PBF_READ_AHEAD); + 188: 8b 6c 24 28 mov 0x28(%esp,1),%ebp + 18c: 81 e5 fb df dd df and $0xdfdddffb,%ebp + * This looks horribly ugly, but the compiler can optimize it totally, + * as we by now know that both pattern and count is constant.. 
+ */ +static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count) +{ + 192: 31 c0 xor %eax,%eax + switch (count) { + case 0: + return s; + case 1: + *(unsigned char *)s = pattern; + return s; + case 2: + *(unsigned short *)s = pattern; + return s; + case 3: + *(unsigned short *)s = pattern; + *(2+(unsigned char *)s) = pattern; + return s; + case 4: + *(unsigned long *)s = pattern; + return s; + } +#define COMMON(x) \ +__asm__ __volatile__( \ + "rep ; stosl" \ + x \ + : "=&c" (d0), "=&D" (d1) \ + : "a" (pattern),"0" (count/4),"1" ((long) s) \ + : "memory") +{ + int d0, d1; + switch (count % 4) { + case 0: COMMON(""); return s; + 194: 8b 7c 24 14 mov 0x14(%esp,1),%edi + 198: b9 33 00 00 00 mov $0x33,%ecx + 19d: f3 ab repz stos %eax,%es:(%edi) + + pb_tracking_get(pb); + + memset(pb, 0, sizeof(page_buf_private_t)); + atomic_set(&pb->pb_hold, 1); + 19f: 8b 44 24 14 mov 0x14(%esp,1),%eax + 1a3: c7 40 18 01 00 00 00 movl $0x1,0x18(%eax) + * + * i'd rather use the more flexible initialization above, but sadly + * GCC 2.7.2.3 emits a bogus warning. EGCS doesnt. Oh well. 
+ */ + atomic_set(&sem->count, val); + 1aa: c7 40 58 00 00 00 00 movl $0x0,0x58(%eax) + sem->sleepers = 0; + 1b1: c7 40 5c 00 00 00 00 movl $0x0,0x5c(%eax) +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); +#endif + q->lock = WAITQUEUE_RW_LOCK_UNLOCKED; + 1b8: 8b 5c 24 14 mov 0x14(%esp,1),%ebx + 1bc: 89 c1 mov %eax,%ecx + 1be: 83 c1 60 add $0x60,%ecx + 1c1: b8 01 00 00 00 mov $0x1,%eax + 1c6: ba ad 4e ad de mov $0xdead4ead,%edx + 1cb: 89 43 60 mov %eax,0x60(%ebx) + 1ce: 89 53 64 mov %edx,0x64(%ebx) + INIT_LIST_HEAD(&q->task_list); + 1d1: 89 d8 mov %ebx,%eax + 1d3: 83 c0 68 add $0x68,%eax + 1d6: 89 41 08 mov %eax,0x8(%ecx) + 1d9: 89 41 0c mov %eax,0xc(%ecx) + init_MUTEX_LOCKED(&pb->pb_iodonesema); + INIT_LIST_HEAD(&pb->pb_list); + 1dc: 89 1b mov %ebx,(%ebx) + 1de: 89 5b 04 mov %ebx,0x4(%ebx) + INIT_LIST_HEAD(&pb->pb_hash_list); + 1e1: 83 c0 a4 add $0xffffffa4,%eax + 1e4: 89 43 0c mov %eax,0xc(%ebx) + 1e7: 89 43 10 mov %eax,0x10(%ebx) + * + * i'd rather use the more flexible initialization above, but sadly + * GCC 2.7.2.3 emits a bogus warning. EGCS doesnt. Oh well. 
+ */ + atomic_set(&sem->count, val); + 1ea: c7 83 98 00 00 00 00 movl $0x0,0x98(%ebx) + 1f1: 00 00 00 + sem->sleepers = 0; + 1f4: c7 83 9c 00 00 00 00 movl $0x0,0x9c(%ebx) + 1fb: 00 00 00 +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); +#endif + q->lock = WAITQUEUE_RW_LOCK_UNLOCKED; + 1fe: 8b 7c 24 14 mov 0x14(%esp,1),%edi + 202: 89 d9 mov %ebx,%ecx + 204: 81 c1 a0 00 00 00 add $0xa0,%ecx + 20a: b8 01 00 00 00 mov $0x1,%eax + 20f: 89 87 a0 00 00 00 mov %eax,0xa0(%edi) + 215: 89 97 a4 00 00 00 mov %edx,0xa4(%edi) + INIT_LIST_HEAD(&q->task_list); + 21b: 89 f8 mov %edi,%eax + 21d: 05 a8 00 00 00 add $0xa8,%eax + 222: 89 41 08 mov %eax,0x8(%ecx) + 225: 89 41 0c mov %eax,0xc(%ecx) + init_MUTEX_LOCKED(&PBP(pb)->pb_sema); /* held, no waiters */ + PB_SET_OWNER(pb); + pb->pb_target = target; + 228: 8b 44 24 18 mov 0x18(%esp,1),%eax + 22c: 89 47 14 mov %eax,0x14(%edi) + pb->pb_file_offset = range_base; + 22f: 8b 44 24 1c mov 0x1c(%esp,1),%eax + 233: 8b 54 24 20 mov 0x20(%esp,1),%edx + 237: 8b 4c 24 14 mov 0x14(%esp,1),%ecx + 23b: 89 41 24 mov %eax,0x24(%ecx) + 23e: 89 51 28 mov %edx,0x28(%ecx) + /* + * Set buffer_length and count_desired to the same value initially. + * IO routines should use count_desired, which will be the same in + * most cases but may be reset (e.g. XFS recovery). 
+ */ + pb->pb_buffer_length = pb->pb_count_desired = range_length; + 241: 89 71 30 mov %esi,0x30(%ecx) + 244: 89 71 2c mov %esi,0x2c(%ecx) + pb->pb_flags = flags | PBF_NONE; + 247: 83 cd 20 or $0x20,%ebp + 24a: 89 69 08 mov %ebp,0x8(%ecx) + pb->pb_bn = PAGE_BUF_DADDR_NULL; + 24d: 8b 5c 24 14 mov 0x14(%esp,1),%ebx + 251: c7 43 1c ff ff ff ff movl $0xffffffff,0x1c(%ebx) + 258: c7 43 20 ff ff ff ff movl $0xffffffff,0x20(%ebx) + atomic_set(&PBP(pb)->pb_pin_count, 0); + 25f: c7 83 b8 00 00 00 00 movl $0x0,0xb8(%ebx) + 266: 00 00 00 +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); +#endif + q->lock = WAITQUEUE_RW_LOCK_UNLOCKED; + 269: 8b 7c 24 14 mov 0x14(%esp,1),%edi + 26d: 89 d9 mov %ebx,%ecx + 26f: 81 c1 bc 00 00 00 add $0xbc,%ecx + 275: b8 01 00 00 00 mov $0x1,%eax + 27a: ba ad 4e ad de mov $0xdead4ead,%edx + 27f: 89 87 bc 00 00 00 mov %eax,0xbc(%edi) + 285: 89 97 c0 00 00 00 mov %edx,0xc0(%edi) + INIT_LIST_HEAD(&q->task_list); + 28b: 89 f8 mov %edi,%eax + 28d: 05 c4 00 00 00 add $0xc4,%eax + 292: 89 41 08 mov %eax,0x8(%ecx) + 295: 89 41 0c mov %eax,0xc(%ecx) + init_waitqueue_head(&PBP(pb)->pb_waiters); + + PB_STATS_INC(pbstats.pb_create); + 298: ff 05 04 00 00 00 incl 0x4 + 29e: 5b pop %ebx + 29f: 5e pop %esi + 2a0: 5f pop %edi + 2a1: 5d pop %ebp + 2a2: c3 ret + PB_TRACE(pb, PB_TRACE_REC(get), target); +} + 2a3: 90 nop + +00000000000002a4 <_pagebuf_get_pages>: + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. 
+ */ +STATIC int +_pagebuf_get_pages( + page_buf_t *pb, + int page_count, + int flags) +{ + 2a4: 57 push %edi + 2a5: 53 push %ebx + 2a6: 8b 7c 24 0c mov 0xc(%esp,1),%edi + 2aa: 8b 5c 24 10 mov 0x10(%esp,1),%ebx + 2ae: 8b 54 24 14 mov 0x14(%esp,1),%edx + int gpf_mask = pb_to_gfp(flags); + 2b2: f7 c2 00 00 00 20 test $0x20000000,%edx + 2b8: 75 17 jne 2d1 <_pagebuf_get_pages+0x2d> + 2ba: 81 e2 00 00 02 00 and $0x20000,%edx + 2c0: b9 f0 01 00 00 mov $0x1f0,%ecx + 2c5: b8 f0 00 00 00 mov $0xf0,%eax + 2ca: 85 d2 test %edx,%edx + 2cc: 0f 45 c8 cmovne %eax,%ecx + 2cf: eb 02 jmp 2d3 <_pagebuf_get_pages+0x2f> + 2d1: 31 c9 xor %ecx,%ecx + + /* Make sure that we have a page list */ + if (pb->pb_pages == NULL) { + 2d3: 83 bf 84 00 00 00 00 cmpl $0x0,0x84(%edi) + 2da: 75 66 jne 342 <_pagebuf_get_pages+0x9e> + pb->pb_offset = page_buf_poff(pb->pb_file_offset); + 2dc: 0f b7 47 24 movzwl 0x24(%edi),%eax + 2e0: 80 e4 0f and $0xf,%ah + 2e3: 66 89 87 80 00 00 00 mov %ax,0x80(%edi) + pb->pb_page_count = page_count; + 2ea: 66 89 5f 7e mov %bx,0x7e(%edi) + if (page_count <= PB_PAGES) { + 2ee: 83 fb 04 cmp $0x4,%ebx + 2f1: 7e 20 jle 313 <_pagebuf_get_pages+0x6f> + pb->pb_pages = pb->pb_page_array; + } else { + pb->pb_pages = kmalloc(sizeof(struct page *) * + 2f3: 51 push %ecx + 2f4: c1 e3 02 shl $0x2,%ebx + 2f7: 53 push %ebx + 2f8: e8 fc ff ff ff call 2f9 <_pagebuf_get_pages+0x55> + 2fd: 89 87 84 00 00 00 mov %eax,0x84(%edi) + page_count, gpf_mask); + if (pb->pb_pages == NULL) + 303: 83 c4 08 add $0x8,%esp + 306: 89 da mov %ebx,%edx + 308: 85 c0 test %eax,%eax + 30a: 75 1a jne 326 <_pagebuf_get_pages+0x82> + return -ENOMEM; + 30c: b8 f4 ff ff ff mov $0xfffffff4,%eax + 311: eb 31 jmp 344 <_pagebuf_get_pages+0xa0> + 313: 8d 87 88 00 00 00 lea 0x88(%edi),%eax + 319: 89 87 84 00 00 00 mov %eax,0x84(%edi) + 31f: 8d 14 9d 00 00 00 00 lea 0x0(,%ebx,4),%edx + * things 32 bits at a time even when we don't know the size of the + * area at compile-time.. 
+ */ +static inline void * __constant_c_memset(void * s, unsigned long c, size_t count) +{ + 326: 8b bf 84 00 00 00 mov 0x84(%edi),%edi + } + memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); + 32c: 31 c0 xor %eax,%eax + */ +static inline void * __constant_c_memset(void * s, unsigned long c, size_t count) +{ +int d0, d1; +__asm__ __volatile__( + 32e: 89 d1 mov %edx,%ecx + 330: c1 e9 02 shr $0x2,%ecx + 333: f3 ab repz stos %eax,%es:(%edi) + 335: f6 c2 02 test $0x2,%dl + 338: 74 02 je 33c <_pagebuf_get_pages+0x98> + 33a: 66 ab stos %ax,%es:(%edi) + 33c: f6 c2 01 test $0x1,%dl + 33f: 74 01 je 342 <_pagebuf_get_pages+0x9e> + 341: aa stos %al,%es:(%edi) + } + return 0; + 342: 31 c0 xor %eax,%eax + 344: 5b pop %ebx + 345: 5f pop %edi + 346: c3 ret +} + 347: 90 nop + +0000000000000348 <_pagebuf_free_object>: + +/* + * Walk a pagebuf releasing all the pages contained within it. + */ +STATIC inline void +_pagebuf_freepages( + page_buf_t *pb) +{ + int buf_index; + + for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) { + struct page *page = pb->pb_pages[buf_index]; + + if (page) { + pb->pb_pages[buf_index] = NULL; + page_cache_release(page); + } + } + + if (pb->pb_pages != pb->pb_page_array) + kfree(pb->pb_pages); +} + +/* + * _pagebuf_free_object + * + * _pagebuf_free_object releases the contents specified buffer. + * The modification state of any associated pages is left unchanged. 
+ */ +void +_pagebuf_free_object( + pb_hash_t *hash, /* hash bucket for buffer */ + page_buf_t *pb) /* buffer to deallocate */ +{ + 348: 57 push %edi + 349: 56 push %esi + 34a: 53 push %ebx + 34b: 8b 74 24 14 mov 0x14(%esp,1),%esi + 34f: 8b 5c 24 10 mov 0x10(%esp,1),%ebx + int pb_flags = pb->pb_flags; + 353: 8b 7e 08 mov 0x8(%esi),%edi + + PB_TRACE(pb, PB_TRACE_REC(free_obj), 0); + pb->pb_flags |= PBF_FREED; + 356: 89 f8 mov %edi,%eax + 358: 0c 80 or $0x80,%al + 35a: 89 46 08 mov %eax,0x8(%esi) + + if (hash) { + 35d: 85 db test %ebx,%ebx + 35f: 74 44 je 3a5 <_pagebuf_free_object+0x5d> + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static __inline__ int list_empty(struct list_head *head) +{ + 361: 8d 4e 0c lea 0xc(%esi),%ecx + if (!list_empty(&pb->pb_hash_list)) { + 364: 39 4e 0c cmp %ecx,0xc(%esi) + 367: 74 17 je 380 <_pagebuf_free_object+0x38> + hash->pb_count--; + 369: ff 4b 08 decl 0x8(%ebx) + * the prev/next entries already! + */ +static __inline__ void __list_del(struct list_head * prev, + struct list_head * next) +{ + 36c: 8b 51 04 mov 0x4(%ecx),%edx + 36f: 8b 46 0c mov 0xc(%esi),%eax + next->prev = prev; + 372: 89 50 04 mov %edx,0x4(%eax) + prev->next = next; + 375: 89 02 mov %eax,(%edx) +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline__ void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static __inline__ void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); + 377: 89 4e 0c mov %ecx,0xc(%esi) + 37a: 89 49 04 mov %ecx,0x4(%ecx) +} + 37d: 8d 76 00 lea 0x0(%esi),%esi + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 380: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 382: 81 7b 10 ad 4e ad de cmpl $0xdead4ead,0x10(%ebx) + 389: 74 08 je 393 <_pagebuf_free_object+0x4b> + BUG(); + 38b: 0f 0b ud2a + 38d: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 393: 8a 43 0c mov 0xc(%ebx),%al + 396: 84 c0 test %al,%al + 398: 7e 08 jle 3a2 <_pagebuf_free_object+0x5a> + BUG(); + 39a: 0f 0b ud2a + 39c: 6b 00 00 imul $0x0,(%eax),%eax + 39f: 00 00 add %al,(%eax) + 3a1: 00 86 53 0c 89 f8 add %al,0xf8890c53(%esi) + list_del_init(&pb->pb_hash_list); + } + spin_unlock(&hash->pb_hash_lock); + } + + if (!(pb_flags & PBF_FREED)) { + 3a7: 84 c0 test %al,%al + 3a9: 0f 8c 96 00 00 00 jl 445 <_pagebuf_free_object+0xfd> + /* release any virtual mapping */ ; + if (pb->pb_flags & _PBF_ADDR_ALLOCATED) { + 3af: f6 46 0b 01 testb $0x1,0xb(%esi) + 3b3: 74 16 je 3cb <_pagebuf_free_object+0x83> + void *vaddr = pagebuf_mapout_locked(pb); + 3b5: 56 push %esi + 3b6: e8 39 1b 00 00 call 1ef4 + if (vaddr) { + 3bb: 83 c4 04 add $0x4,%esp + 3be: 85 c0 test %eax,%eax + 3c0: 74 09 je 3cb <_pagebuf_free_object+0x83> + free_address(vaddr); + 3c2: 50 push %eax + 3c3: e8 80 fc ff ff call 48 + } + 3c8: 83 c4 04 add $0x4,%esp + } + + if (pb->pb_flags & _PBF_MEM_ALLOCATED) { + 3cb: f6 46 0b 02 testb $0x2,0xb(%esi) + 3cf: 74 74 je 445 <_pagebuf_free_object+0xfd> + if (pb->pb_pages) { + 3d1: 8b 96 84 00 00 00 mov 0x84(%esi),%edx + 3d7: 85 d2 test %edx,%edx + 3d9: 74 66 je 441 <_pagebuf_free_object+0xf9> + /* release the pages in the address list */ + if (pb->pb_pages[0] && + 3db: 8b 02 mov (%edx),%eax + 3dd: 85 c0 test %eax,%eax + 3df: 74 
0f je 3f0 <_pagebuf_free_object+0xa8> +#endif + +static __inline__ int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; + 3e1: 8b 40 18 mov 0x18(%eax),%eax + PageSlab(pb->pb_pages[0])) { + 3e4: f6 c4 01 test $0x1,%ah + 3e7: 74 07 je 3f0 <_pagebuf_free_object+0xa8> + /* + * This came from the slab + * allocator free it as such + */ + kfree(pb->pb_addr); + 3e9: 8b 46 34 mov 0x34(%esi),%eax + 3ec: 50 push %eax + } else { + 3ed: eb 40 jmp 42f <_pagebuf_free_object+0xe7> + 3ef: 90 nop + 3f0: 31 db xor %ebx,%ebx + 3f2: 8d be 88 00 00 00 lea 0x88(%esi),%edi + 3f8: 66 83 7e 7e 00 cmpw $0x0,0x7e(%esi) + 3fd: 74 2b je 42a <_pagebuf_free_object+0xe2> + 3ff: 90 nop + 400: 8b 96 84 00 00 00 mov 0x84(%esi),%edx + 406: 8b 04 9a mov (%edx,%ebx,4),%eax + 409: 85 c0 test %eax,%eax + 40b: 74 0e je 41b <_pagebuf_free_object+0xd3> + 40d: c7 04 9a 00 00 00 00 movl $0x0,(%edx,%ebx,4) + 414: 31 d2 xor %edx,%edx + 416: e8 fc ff ff ff call 417 <_pagebuf_free_object+0xcf> + 41b: 8b 96 84 00 00 00 mov 0x84(%esi),%edx + 421: 43 inc %ebx + 422: 0f b7 46 7e movzwl 0x7e(%esi),%eax + 426: 39 c3 cmp %eax,%ebx + 428: 7c d6 jl 400 <_pagebuf_free_object+0xb8> + 42a: 39 fa cmp %edi,%edx + 42c: 74 09 je 437 <_pagebuf_free_object+0xef> + 42e: 52 push %edx + 42f: e8 fc ff ff ff call 430 <_pagebuf_free_object+0xe8> + 434: 83 c4 04 add $0x4,%esp + _pagebuf_freepages(pb); + } + + pb->pb_pages = NULL; + 437: c7 86 84 00 00 00 00 movl $0x0,0x84(%esi) + 43e: 00 00 00 + } + pb->pb_flags &= ~_PBF_MEM_ALLOCATED; + 441: 80 66 0b fd andb $0xfd,0xb(%esi) + } + } + + pb_tracking_free(pb); + pagebuf_deallocate(pb); + 445: a1 24 00 00 00 mov 0x24,%eax + 44a: 56 push %esi + 44b: 50 push %eax + 44c: e8 fc ff ff ff call 44d <_pagebuf_free_object+0x105> +} + 451: 83 c4 08 add $0x8,%esp + 454: 5b pop %ebx + 455: 5e pop %esi + 456: 5f pop %edi + 457: c3 ret + +0000000000000458 <_pagebuf_lookup_pages>: + +/* + * 
_pagebuf_lookup_pages + * + * _pagebuf_lookup_pages finds all pages which match the buffer + * in question and the range of file offsets supplied, + * and builds the page list for the buffer, if the + * page list is not already formed or if not all of the pages are + * already in the list. Invalid pages (pages which have not yet been + * read in from disk) are assigned for any pages which are not found. + */ +STATIC int +_pagebuf_lookup_pages( + page_buf_t *pb, + struct address_space *aspace, + page_buf_flags_t flags) +{ + 458: 83 ec 28 sub $0x28,%esp + 45b: 55 push %ebp + 45c: 57 push %edi + 45d: 56 push %esi + 45e: 53 push %ebx + 45f: 8b 7c 24 3c mov 0x3c(%esp,1),%edi + loff_t next_buffer_offset; + unsigned long page_count, pi, index; + struct page *page; + int gfp_mask, retry_count = 5, rval = 0; + int all_mapped, good_pages; + size_t blocksize; + + /* For pagebufs where we want to map an address, do not use + * highmem pages - so that we do not need to use kmap resources + * to access the data. + * + * For pages where the caller has indicated there may be resource + * contention (e.g. called from a transaction) do not flush + * delalloc pages to obtain memory. 
+ */ + + if (flags & PBF_READ_AHEAD) { + 463: 8b 44 24 44 mov 0x44(%esp,1),%eax + 467: c7 44 24 2c 05 00 00 movl $0x5,0x2c(%esp,1) + 46e: 00 + 46f: c7 44 24 28 00 00 00 movl $0x0,0x28(%esp,1) + 476: 00 + 477: a9 00 00 00 20 test $0x20000000,%eax + 47c: 74 12 je 490 <_pagebuf_lookup_pages+0x38> + gfp_mask = GFP_READAHEAD; + 47e: c7 44 24 30 00 00 00 movl $0x0,0x30(%esp,1) + 485: 00 + retry_count = 0; + 486: c7 44 24 2c 00 00 00 movl $0x0,0x2c(%esp,1) + 48d: 00 + } else if (flags & PBF_DONT_BLOCK) { + 48e: eb 37 jmp 4c7 <_pagebuf_lookup_pages+0x6f> + 490: 8b 54 24 44 mov 0x44(%esp,1),%edx + 494: f7 c2 00 00 02 00 test $0x20000,%edx + 49a: 74 0a je 4a6 <_pagebuf_lookup_pages+0x4e> + gfp_mask = GFP_NOFS; + 49c: c7 44 24 30 f0 00 00 movl $0xf0,0x30(%esp,1) + 4a3: 00 + } else if (flags & PBF_MAPPABLE) { + 4a4: eb 21 jmp 4c7 <_pagebuf_lookup_pages+0x6f> + 4a6: 8b 44 24 44 mov 0x44(%esp,1),%eax + 4aa: 25 00 02 00 00 and $0x200,%eax + gfp_mask = GFP_KERNEL; + } else { + gfp_mask = GFP_HIGHUSER; + 4af: c7 44 24 30 d2 01 00 movl $0x1d2,0x30(%esp,1) + 4b6: 00 + 4b7: ba f0 01 00 00 mov $0x1f0,%edx + 4bc: 85 c0 test %eax,%eax + 4be: 0f 44 54 24 30 cmove 0x30(%esp,1),%edx + 4c3: 89 54 24 30 mov %edx,0x30(%esp,1) + } + + next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length; + 4c7: 8b 4f 24 mov 0x24(%edi),%ecx + 4ca: 8b 5f 28 mov 0x28(%edi),%ebx + 4cd: 89 c8 mov %ecx,%eax + 4cf: 89 da mov %ebx,%edx + 4d1: 03 47 2c add 0x2c(%edi),%eax + 4d4: 83 d2 00 adc $0x0,%edx + + good_pages = page_count = (page_buf_btoc(next_buffer_offset) - + 4d7: 05 ff 0f 00 00 add $0xfff,%eax + 4dc: 83 d2 00 adc $0x0,%edx + 4df: 0f ac d0 0c shrd $0xc,%edx,%eax + 4e3: c1 fa 0c sar $0xc,%edx + 4e6: 0f ac d9 0c shrd $0xc,%ebx,%ecx + 4ea: c1 fb 0c sar $0xc,%ebx + 4ed: 29 c8 sub %ecx,%eax + 4ef: 89 44 24 34 mov %eax,0x34(%esp,1) + 4f3: 89 44 24 20 mov %eax,0x20(%esp,1) + page_buf_btoct(pb->pb_file_offset)); + + if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) { + 4f7: f6 47 0a 40 testb $0x40,0xa(%edi) + 4fb: 
74 4a je 547 <_pagebuf_lookup_pages+0xef> + /* Bring pages forward in cache */ + for (pi = 0; pi < page_count; pi++) { + 4fd: 8b 5c 24 44 mov 0x44(%esp,1),%ebx + 501: 31 f6 xor %esi,%esi + 503: 83 e3 04 and $0x4,%ebx + 506: 39 c6 cmp %eax,%esi + 508: 73 1b jae 525 <_pagebuf_lookup_pages+0xcd> + 50a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + mark_page_accessed(pb->pb_pages[pi]); + 510: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + 516: 8b 04 b0 mov (%eax,%esi,4),%eax + 519: e8 fc ff ff ff call 51a <_pagebuf_lookup_pages+0xc2> + 51e: 46 inc %esi + 51f: 3b 74 24 34 cmp 0x34(%esp,1),%esi + 523: 72 eb jb 510 <_pagebuf_lookup_pages+0xb8> + } + if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) { + 525: 85 db test %ebx,%ebx + 527: 74 17 je 540 <_pagebuf_lookup_pages+0xe8> + 529: f6 47 08 04 testb $0x4,0x8(%edi) + 52d: 75 11 jne 540 <_pagebuf_lookup_pages+0xe8> + all_mapped = 1; + 52f: c7 44 24 24 01 00 00 movl $0x1,0x24(%esp,1) + 536: 00 + goto mapit; + 537: e9 6e 01 00 00 jmp 6aa <_pagebuf_lookup_pages+0x252> + 53c: 8d 74 26 00 lea 0x0(%esi,1),%esi + } + return 0; + 540: 31 c0 xor %eax,%eax + 542: e9 21 02 00 00 jmp 768 <_pagebuf_lookup_pages+0x310> + } + + /* Ensure pb_pages field has been initialised */ + rval = _pagebuf_get_pages(pb, page_count, flags); + 547: 8b 4c 24 44 mov 0x44(%esp,1),%ecx + 54b: 51 push %ecx + 54c: 8b 5c 24 38 mov 0x38(%esp,1),%ebx + 550: 53 push %ebx + 551: 57 push %edi + 552: e8 4d fd ff ff call 2a4 <_pagebuf_get_pages> + 557: 89 44 24 34 mov %eax,0x34(%esp,1) + if (rval) + 55b: 83 c4 0c add $0xc,%esp + 55e: 85 c0 test %eax,%eax + 560: 0f 85 fe 01 00 00 jne 764 <_pagebuf_lookup_pages+0x30c> + return rval; + + rval = pi = 0; + blocksize = pb->pb_target->pbr_blocksize; + 566: 8b 47 14 mov 0x14(%edi),%eax + 569: 31 f6 xor %esi,%esi + 56b: 8b 40 10 mov 0x10(%eax),%eax + 56e: 89 44 24 1c mov %eax,0x1c(%esp,1) + + /* Enter the pages in the page list */ + index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT; + 572: 0f b7 87 80 00 00 00 
movzwl 0x80(%edi),%eax + 579: 8b 4f 24 mov 0x24(%edi),%ecx + 57c: 8b 5f 28 mov 0x28(%edi),%ebx + 57f: 29 c1 sub %eax,%ecx + 581: 83 db 00 sbb $0x0,%ebx + 584: 89 da mov %ebx,%edx + for (all_mapped = 1; pi < page_count; pi++, index++) { + 586: 8b 5c 24 34 mov 0x34(%esp,1),%ebx + 58a: 89 c8 mov %ecx,%eax + 58c: 0f ac d0 0c shrd $0xc,%edx,%eax + 590: c1 fa 0c sar $0xc,%edx + 593: 89 c5 mov %eax,%ebp + 595: c7 44 24 24 01 00 00 movl $0x1,0x24(%esp,1) + 59c: 00 + 59d: 39 5c 24 28 cmp %ebx,0x28(%esp,1) + 5a1: 0f 83 dd 00 00 00 jae 684 <_pagebuf_lookup_pages+0x22c> + 5a7: b8 00 e0 ff ff mov $0xffffe000,%eax + 5ac: 21 e0 and %esp,%eax + 5ae: 89 44 24 18 mov %eax,0x18(%esp,1) + if (pb->pb_pages[pi] == 0) { + 5b2: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + 5b8: 8d 56 01 lea 0x1(%esi),%edx + 5bb: 8d 4d 01 lea 0x1(%ebp),%ecx + 5be: 8b 04 b0 mov (%eax,%esi,4),%eax + 5c1: 89 54 24 14 mov %edx,0x14(%esp,1) + 5c5: 89 4c 24 10 mov %ecx,0x10(%esp,1) + 5c9: 85 c0 test %eax,%eax + 5cb: 75 7b jne 648 <_pagebuf_lookup_pages+0x1f0> + retry: + page = find_or_create_page(aspace, index, gfp_mask); + 5cd: 8b 5c 24 30 mov 0x30(%esp,1),%ebx + 5d1: 53 push %ebx + 5d2: 55 push %ebp + 5d3: 8b 44 24 48 mov 0x48(%esp,1),%eax + 5d7: 50 push %eax + 5d8: e8 fc ff ff ff call 5d9 <_pagebuf_lookup_pages+0x181> + 5dd: 89 c3 mov %eax,%ebx + if (!page) { + 5df: 83 c4 0c add $0xc,%esp + 5e2: 85 db test %ebx,%ebx + 5e4: 75 4a jne 630 <_pagebuf_lookup_pages+0x1d8> + if (--retry_count > 0) { + 5e6: ff 4c 24 2c decl 0x2c(%esp,1) + 5ea: 83 7c 24 2c 00 cmpl $0x0,0x2c(%esp,1) + 5ef: 7e 26 jle 617 <_pagebuf_lookup_pages+0x1bf> + PB_STATS_INC(pbstats.pb_page_retries); + 5f1: ff 05 18 00 00 00 incl 0x18 + pagebuf_daemon_wakeup(1); + 5f7: 6a 01 push $0x1 + 5f9: e8 8e 1f 00 00 call 258c + +struct task_struct; + +static inline struct task_struct * get_current(void) +{ + 5fe: 83 c4 04 add $0x4,%esp + current->state = TASK_UNINTERRUPTIBLE; + 601: 8b 54 24 18 mov 0x18(%esp,1),%edx + 605: c7 02 02 00 00 00 movl $0x2,(%edx) + 
schedule_timeout(10); + 60b: b8 0a 00 00 00 mov $0xa,%eax + 610: e8 fc ff ff ff call 611 <_pagebuf_lookup_pages+0x1b9> + goto retry; + 615: eb b6 jmp 5cd <_pagebuf_lookup_pages+0x175> + } + rval = -ENOMEM; + 617: c7 44 24 28 f4 ff ff movl $0xfffffff4,0x28(%esp,1) + 61e: ff + all_mapped = 0; + 61f: c7 44 24 24 00 00 00 movl $0x0,0x24(%esp,1) + 626: 00 + continue; + 627: eb 49 jmp 672 <_pagebuf_lookup_pages+0x21a> + 629: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + } + PB_STATS_INC(pbstats.pb_page_found); + 630: ff 05 1c 00 00 00 incl 0x1c + mark_page_accessed(page); + 636: 89 d8 mov %ebx,%eax + 638: e8 fc ff ff ff call 639 <_pagebuf_lookup_pages+0x1e1> + pb->pb_pages[pi] = page; + 63d: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + 643: 89 1c b0 mov %ebx,(%eax,%esi,4) + } else { + 646: eb 07 jmp 64f <_pagebuf_lookup_pages+0x1f7> + page = pb->pb_pages[pi]; + 648: 89 c3 mov %eax,%ebx + lock_page(page); + 64a: e8 fc ff ff ff call 64b <_pagebuf_lookup_pages+0x1f3> +#endif + +static __inline__ int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; + 64f: 8b 43 18 mov 0x18(%ebx),%eax + } + + /* If we need to do I/O on a page record the fact */ + if (!Page_Uptodate(page)) { + 652: a8 08 test $0x8,%al + 654: 75 1c jne 672 <_pagebuf_lookup_pages+0x21a> + good_pages--; + 656: ff 4c 24 20 decl 0x20(%esp,1) + if ((blocksize == PAGE_CACHE_SIZE) && + 65a: 81 7c 24 1c 00 10 00 cmpl $0x1000,0x1c(%esp,1) + 661: 00 + 662: 75 0e jne 672 <_pagebuf_lookup_pages+0x21a> + 664: f6 44 24 44 01 testb $0x1,0x44(%esp,1) + 669: 74 07 je 672 <_pagebuf_lookup_pages+0x21a> + (flags & PBF_READ)) + pb->pb_locked = 1; + 66b: c6 87 82 00 00 00 01 movb $0x1,0x82(%edi) + 672: 8b 74 24 14 mov 0x14(%esp,1),%esi + 676: 8b 6c 24 10 mov 0x10(%esp,1),%ebp + 67a: 3b 74 24 34 cmp 0x34(%esp,1),%esi + 67e: 0f 82 2e ff ff ff jb 5b2 <_pagebuf_lookup_pages+0x15a> + } + } + + if (!pb->pb_locked) { + 684: 80 bf 82 00 00 00 00 cmpb 
$0x0,0x82(%edi) + 68b: 75 1d jne 6aa <_pagebuf_lookup_pages+0x252> + for (pi = 0; pi < page_count; pi++) { + 68d: 31 f6 xor %esi,%esi + 68f: 3b 74 24 34 cmp 0x34(%esp,1),%esi + 693: 73 15 jae 6aa <_pagebuf_lookup_pages+0x252> + unlock_page(pb->pb_pages[pi]); + 695: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + 69b: 8b 04 b0 mov (%eax,%esi,4),%eax + 69e: e8 fc ff ff ff call 69f <_pagebuf_lookup_pages+0x247> + 6a3: 46 inc %esi + 6a4: 3b 74 24 34 cmp 0x34(%esp,1),%esi + 6a8: 72 eb jb 695 <_pagebuf_lookup_pages+0x23d> + } + } + +mapit: + pb->pb_flags |= _PBF_MEM_ALLOCATED; + 6aa: 8b 47 08 mov 0x8(%edi),%eax + 6ad: 89 c1 mov %eax,%ecx + 6af: 81 c9 00 00 00 02 or $0x2000000,%ecx + 6b5: 89 4f 08 mov %ecx,0x8(%edi) + if (all_mapped) { + 6b8: 83 7c 24 24 00 cmpl $0x0,0x24(%esp,1) + 6bd: 0f 84 83 00 00 00 je 746 <_pagebuf_lookup_pages+0x2ee> + pb->pb_flags |= _PBF_ALL_PAGES_MAPPED; + 6c3: 89 c1 mov %eax,%ecx + 6c5: 81 c9 00 00 40 02 or $0x2400000,%ecx + 6cb: 89 4f 08 mov %ecx,0x8(%edi) + + /* A single page buffer is always mappable */ + if (page_count == 1) { + 6ce: 83 7c 24 34 01 cmpl $0x1,0x34(%esp,1) + 6d3: 75 1d jne 6f2 <_pagebuf_lookup_pages+0x29a> + pb->pb_addr = (caddr_t) + 6d5: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + 6db: 8b 10 mov (%eax),%edx + page_address(pb->pb_pages[0]) + pb->pb_offset; + pb->pb_flags |= PBF_MAPPED; + 6dd: 83 c9 04 or $0x4,%ecx + 6e0: 0f b7 87 80 00 00 00 movzwl 0x80(%edi),%eax + 6e7: 03 42 2c add 0x2c(%edx),%eax + 6ea: 89 47 34 mov %eax,0x34(%edi) + 6ed: 89 4f 08 mov %ecx,0x8(%edi) + } else if (flags & PBF_MAPPED) { + 6f0: eb 54 jmp 746 <_pagebuf_lookup_pages+0x2ee> + 6f2: 8b 5c 24 44 mov 0x44(%esp,1),%ebx + 6f6: f6 c3 04 test $0x4,%bl + 6f9: 74 4b je 746 <_pagebuf_lookup_pages+0x2ee> + if (as_list_len > 64) + 6fb: 83 3d e4 05 00 00 40 cmpl $0x40,0x5e4 + 702: 7e 05 jle 709 <_pagebuf_lookup_pages+0x2b1> + purge_addresses(); + 704: e8 cb f9 ff ff call d4 + pb->pb_addr = remap_page_array(pb->pb_pages, + 709: 8b 44 24 30 mov 0x30(%esp,1),%eax + 70d: 50 
push %eax + 70e: 8b 54 24 38 mov 0x38(%esp,1),%edx + 712: 52 push %edx + 713: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + 719: 50 push %eax + 71a: e8 fc ff ff ff call 71b <_pagebuf_lookup_pages+0x2c3> + 71f: 89 c2 mov %eax,%edx + 721: 89 57 34 mov %edx,0x34(%edi) + page_count, gfp_mask); + if (!pb->pb_addr) + 724: 83 c4 0c add $0xc,%esp + 727: 85 d2 test %edx,%edx + 729: 75 08 jne 733 <_pagebuf_lookup_pages+0x2db> + BUG(); + 72b: 0f 0b ud2a + 72d: 48 dec %eax + 72e: 02 34 00 add (%eax,%eax,1),%dh + 731: 00 00 add %al,(%eax) + pb->pb_addr += pb->pb_offset; + 733: 0f b7 87 80 00 00 00 movzwl 0x80(%edi),%eax + 73a: 01 c2 add %eax,%edx + 73c: 89 57 34 mov %edx,0x34(%edi) + pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED; + 73f: 81 4f 08 04 00 00 01 orl $0x1000004,0x8(%edi) + } + } + /* If some pages were found with data in them + * we are not in PBF_NONE state. + */ + if (good_pages != 0) { + 746: 83 7c 24 20 00 cmpl $0x0,0x20(%esp,1) + 74b: 74 17 je 764 <_pagebuf_lookup_pages+0x30c> + pb->pb_flags &= ~(PBF_NONE); + 74d: 8b 47 08 mov 0x8(%edi),%eax + 750: 24 df and $0xdf,%al + 752: 89 47 08 mov %eax,0x8(%edi) + if (good_pages != page_count) { + 755: 8b 4c 24 34 mov 0x34(%esp,1),%ecx + 759: 39 4c 24 20 cmp %ecx,0x20(%esp,1) + 75d: 74 05 je 764 <_pagebuf_lookup_pages+0x30c> + pb->pb_flags |= PBF_PARTIAL; + 75f: 0c 08 or $0x8,%al + 761: 89 47 08 mov %eax,0x8(%edi) + } + } + + PB_TRACE(pb, PB_TRACE_REC(look_pg), good_pages); + + return rval; + 764: 8b 44 24 28 mov 0x28(%esp,1),%eax + 768: 5b pop %ebx + 769: 5e pop %esi + 76a: 5f pop %edi + 76b: 5d pop %ebp + 76c: 83 c4 28 add $0x28,%esp + 76f: c3 ret + +0000000000000770 <_pagebuf_prealloc_bh>: +} + + +/* + * Pre-allocation of a pool of buffer heads for use in + * low-memory situations. + */ + +/* + * _pagebuf_prealloc_bh + * + * Pre-allocate a pool of "count" buffer heads at startup. + * Puts them on a list at "pb_resv_bh" + * Returns number of bh actually allocated to pool. 
+ */ +STATIC int +_pagebuf_prealloc_bh( + int count) +{ + 770: 56 push %esi + 771: 53 push %ebx + 772: 8b 74 24 0c mov 0xc(%esp,1),%esi + struct buffer_head *bh; + int i; + + for (i = 0; i < count; i++) { + 776: 31 db xor %ebx,%ebx + 778: eb 21 jmp 79b <_pagebuf_prealloc_bh+0x2b> + 77a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); + if (!bh) + break; + bh->b_pprev = &pb_resv_bh; + bh->b_next = pb_resv_bh; + 780: a1 00 00 00 00 mov 0x0,%eax + 785: c7 42 30 00 00 00 00 movl $0x0,0x30(%edx) + 78c: 89 02 mov %eax,(%edx) + pb_resv_bh = bh; + 78e: 89 15 00 00 00 00 mov %edx,0x0 + pb_resv_bh_cnt++; + 794: ff 05 00 00 00 00 incl 0x0 + 79a: 43 inc %ebx + 79b: 39 f3 cmp %esi,%ebx + 79d: 7d 19 jge 7b8 <_pagebuf_prealloc_bh+0x48> + 79f: a1 00 00 00 00 mov 0x0,%eax + 7a4: 68 f0 01 00 00 push $0x1f0 + 7a9: 50 push %eax + 7aa: e8 fc ff ff ff call 7ab <_pagebuf_prealloc_bh+0x3b> + 7af: 89 c2 mov %eax,%edx + 7b1: 83 c4 08 add $0x8,%esp + 7b4: 85 d2 test %edx,%edx + 7b6: 75 c8 jne 780 <_pagebuf_prealloc_bh+0x10> + } + return i; + 7b8: 89 d8 mov %ebx,%eax + 7ba: 5b pop %ebx + 7bb: 5e pop %esi + 7bc: c3 ret +} + 7bd: 8d 76 00 lea 0x0(%esi),%esi + +00000000000007c0 <_pagebuf_get_prealloc_bh>: + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 7c0: 83 ec 14 sub $0x14,%esp + 7c3: 55 push %ebp + 7c4: 57 push %edi + 7c5: bf 00 e0 ff ff mov $0xffffe000,%edi + 7ca: 21 e7 and %esp,%edi + 7cc: 56 push %esi + 7cd: 53 push %ebx + +/* + * _pagebuf_get_prealloc_bh + * + * Get one buffer head from our pre-allocated pool. + * If pool is empty, sleep 'til one comes back in. + * Returns aforementioned buffer head. 
+ */ +STATIC struct buffer_head * +_pagebuf_get_prealloc_bh(void) +{ + unsigned long flags; + struct buffer_head *bh = NULL; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE (wait, tsk); + 7ce: c7 44 24 14 00 00 00 movl $0x0,0x14(%esp,1) + 7d5: 00 + 7d6: c7 44 24 1c 00 00 00 movl $0x0,0x1c(%esp,1) + 7dd: 00 + 7de: c7 44 24 20 00 00 00 movl $0x0,0x20(%esp,1) + 7e5: 00 + 7e6: 89 7c 24 18 mov %edi,0x18(%esp,1) + + spin_lock_irqsave(&pb_resv_bh_lock, flags); + 7ea: 9c pushf + 7eb: 5e pop %esi + 7ec: fa cli + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 7ed: bd 00 00 00 00 mov $0x0,%ebp + 7f2: 89 eb mov %ebp,%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 7f4: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 7fb: 4e ad de + 7fe: 74 1a je 81a <_pagebuf_get_prealloc_bh+0x5a> +printk("eip: %p\n", &&here); + 800: 68 f4 07 00 00 push $0x7f4 + 805: 68 2b 00 00 00 push $0x2b + 80a: e8 fc ff ff ff call 80b <_pagebuf_get_prealloc_bh+0x4b> + BUG(); + 80f: 0f 0b ud2a + 811: 85 00 test %eax,(%eax) + 813: 00 00 add %al,(%eax) + 815: 00 00 add %al,(%eax) + } + 817: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 81a: f0 fe 0b lock decb (%ebx) + 81d: 0f 88 c6 25 00 00 js 2de9 + + if (pb_resv_bh_cnt < 1) { + 823: 83 3d 00 00 00 00 00 cmpl $0x0,0x0 + 82a: 0f 8f d4 00 00 00 jg 904 <_pagebuf_get_prealloc_bh+0x144> + + add_wait_queue(&pb_resv_bh_wait, &wait); + 830: 8d 5c 24 14 lea 0x14(%esp,1),%ebx + 834: 89 da mov %ebx,%edx + 836: b8 40 03 00 00 mov $0x340,%eax + 83b: e8 fc ff ff ff call 83c <_pagebuf_get_prealloc_bh+0x7c> + 840: 89 5c 24 10 mov %ebx,0x10(%esp,1) +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + 844: 81 3d 00 00 00 00 00 cmpl $0x0,0x0 + 84b: 00 00 00 + 84e: 74 0d je 85d <_pagebuf_get_prealloc_bh+0x9d> + __run_task_queue(list); + 850: 68 00 00 00 00 push $0x0 + 855: e8 fc ff ff ff call 856 
<_pagebuf_get_prealloc_bh+0x96> + 85a: 83 c4 04 add $0x4,%esp + do { + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + 85d: b8 02 00 00 00 mov $0x2,%eax + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 4: + __asm__ __volatile__("xchgl %0,%1" + 862: 87 07 xchg %eax,(%edi) + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 864: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 866: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 86d: 4e ad de + 870: 74 0e je 880 <_pagebuf_get_prealloc_bh+0xc0> + BUG(); + 872: 0f 0b ud2a + 874: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + 87a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + if (!spin_is_locked(lock)) + 880: a0 00 00 00 00 mov 0x0,%al + 885: 84 c0 test %al,%al + 887: 7e 08 jle 891 <_pagebuf_get_prealloc_bh+0xd1> + BUG(); + 889: 0f 0b ud2a + 88b: 6b 00 00 imul $0x0,(%eax),%eax + 88e: 00 00 add %al,(%eax) + 890: 00 86 15 00 00 00 add %al,0x15(%esi) +#endif + __asm__ __volatile__( + 896: 00 56 9d add %dl,0xffffff9d(%esi) + spin_unlock_irqrestore(&pb_resv_bh_lock, flags); + schedule(); + 899: e8 fc ff ff ff call 89a <_pagebuf_get_prealloc_bh+0xda> + spin_lock_irqsave(&pb_resv_bh_lock, flags); + 89e: 9c pushf + 89f: 5e pop %esi + 8a0: fa cli + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 8a1: 89 eb mov %ebp,%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 8a3: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 8aa: 4e ad de + 8ad: 74 1a je 8c9 <_pagebuf_get_prealloc_bh+0x109> +printk("eip: %p\n", &&here); + 8af: 68 a3 08 00 00 push $0x8a3 + 8b4: 68 2b 00 00 00 push $0x2b + 8b9: e8 fc ff ff ff call 8ba <_pagebuf_get_prealloc_bh+0xfa> + BUG(); + 8be: 0f 0b ud2a + 8c0: 85 00 test %eax,(%eax) + 8c2: 00 00 add %al,(%eax) + 8c4: 00 00 add %al,(%eax) + } + 8c6: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 8c9: f0 fe 0b lock decb (%ebx) + 8cc: 0f 88 23 25 
00 00 js 2df5 + } while (pb_resv_bh_cnt < 1); + 8d2: 83 3d 00 00 00 00 00 cmpl $0x0,0x0 + 8d9: 0f 8e 65 ff ff ff jle 844 <_pagebuf_get_prealloc_bh+0x84> + tsk->state = TASK_RUNNING; + 8df: c7 07 00 00 00 00 movl $0x0,(%edi) + remove_wait_queue(&pb_resv_bh_wait, &wait); + 8e5: 8b 54 24 10 mov 0x10(%esp,1),%edx + 8e9: b8 40 03 00 00 mov $0x340,%eax + 8ee: e8 fc ff ff ff call 8ef <_pagebuf_get_prealloc_bh+0x12f> + } + + if (pb_resv_bh_cnt < 1) + 8f3: 83 3d 00 00 00 00 00 cmpl $0x0,0x0 + 8fa: 7f 08 jg 904 <_pagebuf_get_prealloc_bh+0x144> + BUG(); + 8fc: 0f 0b ud2a + 8fe: 9c pushf + 8ff: 02 34 00 add (%eax,%eax,1),%dh + 902: 00 00 add %al,(%eax) + + bh = pb_resv_bh; + 904: 8b 0d 00 00 00 00 mov 0x0,%ecx + + if (!bh) + 90a: 85 c9 test %ecx,%ecx + 90c: 75 08 jne 916 <_pagebuf_get_prealloc_bh+0x156> + BUG(); + 90e: 0f 0b ud2a + 910: a1 02 34 00 00 mov 0x3402,%eax + 915: 00 8b 01 a3 00 00 add %cl,0xa301(%ebx) + + pb_resv_bh = bh->b_next; + 91b: 00 00 add %al,(%eax) + bh->b_state = 0; + 91d: c7 41 18 00 00 00 00 movl $0x0,0x18(%ecx) + pb_resv_bh_cnt--; + 924: ff 0d 00 00 00 00 decl 0x0 + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 92a: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 92c: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 933: 4e ad de + 936: 74 08 je 940 <_pagebuf_get_prealloc_bh+0x180> + BUG(); + 938: 0f 0b ud2a + 93a: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 940: a0 00 00 00 00 mov 0x0,%al + 945: 84 c0 test %al,%al + 947: 7e 08 jle 951 <_pagebuf_get_prealloc_bh+0x191> + BUG(); + 949: 0f 0b ud2a + 94b: 6b 00 00 imul $0x0,(%eax),%eax + 94e: 00 00 add %al,(%eax) + 950: 00 86 15 00 00 00 add %al,0x15(%esi) +#endif + __asm__ __volatile__( + 956: 00 56 9d add %dl,0xffffff9d(%esi) + + spin_unlock_irqrestore(&pb_resv_bh_lock, flags); + + return bh; + 959: 89 c8 mov %ecx,%eax + 95b: 5b pop %ebx + 95c: 5e pop %esi + 95d: 5f pop %edi + 95e: 5d pop %ebp + 95f: 
83 c4 14 add $0x14,%esp + 962: c3 ret +} + 963: 90 nop + +0000000000000964 <_pagebuf_find>: + +/* + * _pagebuf_free_bh + * + * Take care of buffer heads that we're finished with. + * Call this instead of just kmem_cache_free(bh_cachep, bh) + * when you're done with a bh. + * + * If our pre-allocated pool is full, just free the buffer head. + * Otherwise, put it back in the pool, and wake up anybody + * waiting for one. + */ +STATIC inline void +_pagebuf_free_bh( + struct buffer_head *bh) +{ + unsigned long flags; + + if (pb_resv_bh_cnt == NR_RESERVED_BH){ + kmem_cache_free(bh_cachep, bh); + } else { + spin_lock_irqsave(&pb_resv_bh_lock, flags); + + bh->b_pprev = &pb_resv_bh; + bh->b_next = pb_resv_bh; + pb_resv_bh = bh; + pb_resv_bh_cnt++; + + if (waitqueue_active(&pb_resv_bh_wait)) { + wake_up(&pb_resv_bh_wait); + } + + spin_unlock_irqrestore(&pb_resv_bh_lock, flags); + } +} + +/* + * Finding and Reading Buffers + */ + +/* + * _pagebuf_find + * + * Looks up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. If other overlapping buffers exist, they are + * released before the new buffer is created and locked, + * which may imply that this call will block until those buffers + * are unlocked. No I/O is implied by this call. 
+ */ +STATIC page_buf_t * +_pagebuf_find( /* find buffer for block */ + pb_target_t *target,/* target for block */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags, /* PBF_TRYLOCK */ + page_buf_t *new_pb)/* newly allocated buffer */ +{ + 964: 83 ec 0c sub $0xc,%esp + 967: 55 push %ebp + 968: 57 push %edi + 969: 56 push %esi + 96a: 53 push %ebx + 96b: 8b 74 24 24 mov 0x24(%esp,1),%esi + 96f: 8b 7c 24 28 mov 0x28(%esp,1),%edi + loff_t range_base; + size_t range_length; + int hval; + pb_hash_t *h; + struct list_head *p; + page_buf_t *pb; + int not_locked; + + range_base = (ioff << SECTOR_SHIFT); + range_length = (isize << SECTOR_SHIFT); + 973: 8b 44 24 2c mov 0x2c(%esp,1),%eax + 977: 0f a4 f7 09 shld $0x9,%esi,%edi + 97b: c1 e6 09 shl $0x9,%esi + 97e: c1 e0 09 shl $0x9,%eax + 981: 89 44 24 18 mov %eax,0x18(%esp,1) + + hval = _bhash(target->pbr_bdev->bd_dev, range_base); + 985: 57 push %edi + 986: 56 push %esi + 987: 8b 54 24 28 mov 0x28(%esp,1),%edx + 98b: 8b 42 08 mov 0x8(%edx),%eax + 98e: 0f b7 40 10 movzwl 0x10(%eax),%eax + 992: 50 push %eax + 993: e8 68 f6 ff ff call 0 <_bhash> + 998: 89 44 24 20 mov %eax,0x20(%esp,1) + h = &pbhash[hval]; + 99c: 8d 04 80 lea (%eax,%eax,4),%eax + 99f: 8d 2c 85 60 03 00 00 lea 0x360(,%eax,4),%ebp + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 9a6: 8d 5d 0c lea 0xc(%ebp),%ebx + 9a9: 83 c4 0c add $0xc,%esp +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 9ac: 81 7b 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ebx) + 9b3: 74 1a je 9cf <_pagebuf_find+0x6b> +printk("eip: %p\n", &&here); + 9b5: 68 ac 09 00 00 push $0x9ac + 9ba: 68 2b 00 00 00 push $0x2b + 9bf: e8 fc ff ff ff call 9c0 <_pagebuf_find+0x5c> + BUG(); + 9c4: 0f 0b ud2a + 9c6: 85 00 test %eax,(%eax) + 9c8: 00 00 add %al,(%eax) + 9ca: 00 00 add %al,(%eax) + } + 9cc: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 9cf: f0 fe 0b lock decb (%ebx) + 9d2: 
0f 88 29 24 00 00 js 2e01 + + spin_lock(&h->pb_hash_lock); + list_for_each(p, &h->pb_hash) { + 9d8: 8b 45 00 mov 0x0(%ebp),%eax + 9db: 8d 4d 0c lea 0xc(%ebp),%ecx + 9de: 89 4c 24 10 mov %ecx,0x10(%esp,1) + pb = list_entry(p, page_buf_t, pb_hash_list); + + if ((target == pb->pb_target) && + (pb->pb_file_offset == range_base) && + (pb->pb_buffer_length == range_length)) { + if (pb->pb_flags & PBF_FREED) + 9e2: eb 24 jmp a08 <_pagebuf_find+0xa4> + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline__ void list_del(struct list_head *entry) +{ + 9e4: 8d 4b 0c lea 0xc(%ebx),%ecx + 9e7: 8b 51 04 mov 0x4(%ecx),%edx + 9ea: 8b 43 0c mov 0xc(%ebx),%eax + 9ed: 89 50 04 mov %edx,0x4(%eax) + 9f0: 89 02 mov %eax,(%edx) + 9f2: 8b 45 00 mov 0x0(%ebp),%eax + 9f5: 89 48 04 mov %ecx,0x4(%eax) + 9f8: 89 43 0c mov %eax,0xc(%ebx) + 9fb: 89 69 04 mov %ebp,0x4(%ecx) + 9fe: 89 4d 00 mov %ecx,0x0(%ebp) + break; + /* If we look at something bring it to the + * front of the list for next time + */ + list_del(&pb->pb_hash_list); + list_add(&pb->pb_hash_list, &h->pb_hash); + goto found; + a01: e9 ba 00 00 00 jmp ac0 <_pagebuf_find+0x15c> + a06: 8b 00 mov (%eax),%eax + } + } + a08: 39 e8 cmp %ebp,%eax + a0a: 74 25 je a31 <_pagebuf_find+0xcd> + a0c: 8b 54 24 20 mov 0x20(%esp,1),%edx + a10: 8d 58 f4 lea 0xfffffff4(%eax),%ebx + a13: 3b 53 14 cmp 0x14(%ebx),%edx + a16: 75 ee jne a06 <_pagebuf_find+0xa2> + a18: 39 73 24 cmp %esi,0x24(%ebx) + a1b: 75 e9 jne a06 <_pagebuf_find+0xa2> + a1d: 39 7b 28 cmp %edi,0x28(%ebx) + a20: 75 e4 jne a06 <_pagebuf_find+0xa2> + a22: 8b 4c 24 18 mov 0x18(%esp,1),%ecx + a26: 39 4b 2c cmp %ecx,0x2c(%ebx) + a29: 75 db jne a06 <_pagebuf_find+0xa2> + a2b: 80 78 fc 00 cmpb $0x0,0xfffffffc(%eax) + a2f: 7d b3 jge 9e4 <_pagebuf_find+0x80> + + /* No match found */ + if (new_pb) { + a31: 83 7c 24 34 00 cmpl $0x0,0x34(%esp,1) + a36: 74 4a je a82 <_pagebuf_find+0x11e> + 
_pagebuf_initialize(new_pb, target, range_base, + a38: 80 4c 24 32 08 orb $0x8,0x32(%esp,1) + a3d: 8b 44 24 30 mov 0x30(%esp,1),%eax + a41: 50 push %eax + a42: 8b 54 24 1c mov 0x1c(%esp,1),%edx + a46: 52 push %edx + a47: 57 push %edi + a48: 56 push %esi + a49: 8b 4c 24 30 mov 0x30(%esp,1),%ecx + a4d: 51 push %ecx + a4e: 8b 44 24 48 mov 0x48(%esp,1),%eax + a52: 50 push %eax + a53: e8 28 f7 ff ff call 180 <_pagebuf_initialize> + range_length, flags | _PBF_LOCKABLE); + new_pb->pb_hash_index = hval; + a58: 8b 4c 24 4c mov 0x4c(%esp,1),%ecx + a5c: 8a 54 24 2c mov 0x2c(%esp,1),%dl + a60: 88 91 83 00 00 00 mov %dl,0x83(%ecx) + */ +static __inline__ void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + a66: 8b 45 00 mov 0x0(%ebp),%eax + h->pb_count++; + a69: ff 45 08 incl 0x8(%ebp) + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static __inline__ void list_add(struct list_head *new, struct list_head *head) +{ + a6c: 89 ca mov %ecx,%edx + a6e: 83 c2 0c add $0xc,%edx + a71: 83 c4 18 add $0x18,%esp + a74: 89 50 04 mov %edx,0x4(%eax) + a77: 89 41 0c mov %eax,0xc(%ecx) + a7a: 89 6a 04 mov %ebp,0x4(%edx) + a7d: 89 55 00 mov %edx,0x0(%ebp) + list_add(&new_pb->pb_hash_list, &h->pb_hash); + } else { + a80: eb 06 jmp a88 <_pagebuf_find+0x124> + PB_STATS_INC(pbstats.pb_miss_locked); + a82: ff 05 14 00 00 00 incl 0x14 +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + a88: 8b 44 24 10 mov 0x10(%esp,1),%eax + a8c: b2 01 mov $0x1,%dl + a8e: 81 78 04 ad 4e ad de cmpl $0xdead4ead,0x4(%eax) + a95: 74 08 je a9f <_pagebuf_find+0x13b> + BUG(); + a97: 0f 0b ud2a + a99: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + a9f: 8a 45 0c mov 0xc(%ebp),%al + aa2: 84 c0 test %al,%al + aa4: 7e 08 jle aae <_pagebuf_find+0x14a> + BUG(); + aa6: 0f 0b ud2a + aa8: 6b 00 00 imul $0x0,(%eax),%eax + aab: 00 00 add 
%al,(%eax) + aad: 00 86 55 0c 8b 44 add %al,0x448b0c55(%esi) + } + + spin_unlock(&h->pb_hash_lock); + return (new_pb); + ab3: 24 34 and $0x34,%al + ab5: e9 96 00 00 00 jmp b50 <_pagebuf_find+0x1ec> + aba: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + ac0: f0 ff 43 18 lock incl 0x18(%ebx) +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + ac4: 8b 4c 24 10 mov 0x10(%esp,1),%ecx + ac8: b2 01 mov $0x1,%dl + aca: 81 79 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ecx) + ad1: 74 08 je adb <_pagebuf_find+0x177> + BUG(); + ad3: 0f 0b ud2a + ad5: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + adb: 8a 45 0c mov 0xc(%ebp),%al + ade: 84 c0 test %al,%al + ae0: 7e 08 jle aea <_pagebuf_find+0x186> + BUG(); + ae2: 0f 0b ud2a + ae4: 6b 00 00 imul $0x0,(%eax),%eax + ae7: 00 00 add %al,(%eax) + ae9: 00 86 55 0c 8d 8b add %al,0x8b8d0c55(%esi) + * Non-blockingly attempt to down() a semaphore. + * Returns zero if we acquired it + */ +static inline int down_trylock(struct semaphore * sem) +{ + aef: 98 cwtl + af0: 00 00 add %al,(%eax) + af2: 00 f0 add %dh,%al + int result; + +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + + __asm__ __volatile__( + af4: ff 8b 98 00 00 00 decl 0x98(%ebx) + afa: 0f 88 0d 23 00 00 js 2e0d + b00: 31 c0 xor %eax,%eax + +found: + atomic_inc(&pb->pb_hold); + spin_unlock(&h->pb_hash_lock); + + /* Attempt to get the semaphore without sleeping, + * if this does not work then we need to drop the + * spinlock and do a hard attempt on the semaphore. 
+ */ + not_locked = down_trylock(&PBP(pb)->pb_sema); + if (not_locked) { + b02: 85 c0 test %eax,%eax + b04: 74 2b je b31 <_pagebuf_find+0x1cd> + if (!(flags & PBF_TRYLOCK)) { + b06: 8b 44 24 30 mov 0x30(%esp,1),%eax + b0a: f6 c4 40 test $0x40,%ah + b0d: 74 13 je b22 <_pagebuf_find+0x1be> + /* wait for buffer ownership */ + PB_TRACE(pb, PB_TRACE_REC(get_lk), 0); + pagebuf_lock(pb); + PB_STATS_INC(pbstats.pb_get_locked_waited); + } else { + /* We asked for a trylock and failed, no need + * to look at file offset and length here, we + * know that this pagebuf at least overlaps our + * pagebuf and is locked, therefore our buffer + * either does not exist, or is this buffer + */ + + pagebuf_rele(pb); + b0f: 53 push %ebx + b10: e8 fc ff ff ff call b11 <_pagebuf_find+0x1ad> + PB_STATS_INC(pbstats.pb_busy_locked); + return (NULL); + b15: 31 c0 xor %eax,%eax + b17: ff 05 10 00 00 00 incl 0x10 + b1d: 83 c4 04 add $0x4,%esp + b20: eb 2e jmp b50 <_pagebuf_find+0x1ec> + b22: 53 push %ebx + b23: e8 fc ff ff ff call b24 <_pagebuf_find+0x1c0> + b28: ff 05 0c 00 00 00 incl 0xc + b2e: 83 c4 04 add $0x4,%esp + } + } else { + /* trylock worked */ + PB_SET_OWNER(pb); + } + + if (pb->pb_flags & PBF_STALE) + b31: 8b 43 08 mov 0x8(%ebx),%eax + b34: f6 c4 04 test $0x4,%ah + b37: 74 08 je b41 <_pagebuf_find+0x1dd> + pb->pb_flags &= PBF_MAPPABLE | \ + b39: 25 04 02 c8 03 and $0x3c80204,%eax + b3e: 89 43 08 mov %eax,0x8(%ebx) + PBF_MAPPED | \ + _PBF_LOCKABLE | \ + _PBF_ALL_PAGES_MAPPED | \ + _PBF_SOME_INVALID_PAGES | \ + _PBF_ADDR_ALLOCATED | \ + _PBF_MEM_ALLOCATED; + PB_TRACE(pb, PB_TRACE_REC(got_lk), 0); + PB_STATS_INC(pbstats.pb_get_locked); + b41: ff 05 08 00 00 00 incl 0x8 + return (pb); + b47: 89 d8 mov %ebx,%eax +} + b49: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + b50: 5b pop %ebx + b51: 5e pop %esi + b52: 5f pop %edi + b53: 5d pop %ebp + b54: 83 c4 0c add $0xc,%esp + b57: c3 ret + +0000000000000b58 : + + +/* + * pagebuf_find + * + * pagebuf_find returns a buffer matching the 
specified range of + * data for the specified target, if any of the relevant blocks + * are in memory. The buffer may have unallocated holes, if + * some, but not all, of the blocks are in memory. Even where + * pages are present in the buffer, not all of every page may be + * valid. The file system may use pagebuf_segment to visit the + * various segments of the buffer. + */ +page_buf_t * +pagebuf_find( /* find buffer for block */ + /* if the block is in memory */ + pb_target_t *target,/* target for block */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags) /* PBF_TRYLOCK */ +{ + b58: 8b 4c 24 04 mov 0x4(%esp,1),%ecx + b5c: 8b 54 24 10 mov 0x10(%esp,1),%edx + b60: 8b 44 24 14 mov 0x14(%esp,1),%eax + return _pagebuf_find(target, ioff, isize, flags, NULL); + b64: 6a 00 push $0x0 + b66: 50 push %eax + b67: 52 push %edx + b68: 8b 44 24 14 mov 0x14(%esp,1),%eax + b6c: 8b 54 24 18 mov 0x18(%esp,1),%edx + b70: 52 push %edx + b71: 50 push %eax + b72: 51 push %ecx + b73: e8 ec fd ff ff call 964 <_pagebuf_find> + b78: 83 c4 18 add $0x18,%esp + b7b: c3 ret + +0000000000000b7c : +} + +/* + * pagebuf_get + * + * pagebuf_get assembles a buffer covering the specified range. + * Some or all of the blocks in the range may be valid. The file + * system may use pagebuf_segment to visit the various segments + * of the buffer. Storage in memory for all portions of the + * buffer will be allocated, although backing storage may not be. 
+ * If PBF_READ is set in flags, pagebuf_read + */ +page_buf_t * +pagebuf_get( /* allocate a buffer */ + pb_target_t *target,/* target for buffer */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags) /* PBF_TRYLOCK */ +{ + b7c: 57 push %edi + b7d: 56 push %esi + b7e: 53 push %ebx + b7f: 8b 7c 24 20 mov 0x20(%esp,1),%edi + page_buf_t *pb, *new_pb; + int error; + + new_pb = pagebuf_allocate(flags); + b83: f7 c7 00 00 00 20 test $0x20000000,%edi + b89: 75 18 jne ba3 + b8b: 89 f8 mov %edi,%eax + b8d: 25 00 00 02 00 and $0x20000,%eax + b92: b9 f0 01 00 00 mov $0x1f0,%ecx + b97: ba f0 00 00 00 mov $0xf0,%edx + b9c: 85 c0 test %eax,%eax + b9e: 0f 45 ca cmovne %edx,%ecx + ba1: eb 02 jmp ba5 + ba3: 31 c9 xor %ecx,%ecx + ba5: a1 24 00 00 00 mov 0x24,%eax + baa: 51 push %ecx + bab: 50 push %eax + bac: e8 fc ff ff ff call bad + bb1: 89 c6 mov %eax,%esi + if (unlikely(!new_pb)) + bb3: 83 c4 08 add $0x8,%esp + bb6: 85 f6 test %esi,%esi + bb8: 74 37 je bf1 + return (NULL); + + pb = _pagebuf_find(target, ioff, isize, flags, new_pb); + bba: 56 push %esi + bbb: 57 push %edi + bbc: 8b 44 24 24 mov 0x24(%esp,1),%eax + bc0: 50 push %eax + bc1: 8b 44 24 20 mov 0x20(%esp,1),%eax + bc5: 8b 54 24 24 mov 0x24(%esp,1),%edx + bc9: 52 push %edx + bca: 50 push %eax + bcb: 8b 44 24 24 mov 0x24(%esp,1),%eax + bcf: 50 push %eax + bd0: e8 8f fd ff ff call 964 <_pagebuf_find> + bd5: 89 c3 mov %eax,%ebx + if (pb != new_pb) { + bd7: 83 c4 18 add $0x18,%esp + bda: 39 f3 cmp %esi,%ebx + bdc: 74 1a je bf8 + pagebuf_deallocate(new_pb); + bde: a1 24 00 00 00 mov 0x24,%eax + be3: 56 push %esi + be4: 50 push %eax + be5: e8 fc ff ff ff call be6 + if (unlikely(!pb)) + bea: 83 c4 08 add $0x8,%esp + bed: 85 db test %ebx,%ebx + bef: 75 07 jne bf8 + return (NULL); + bf1: 31 c0 xor %eax,%eax + bf3: e9 8f 00 00 00 jmp c87 + } + + PB_STATS_INC(pbstats.pb_get); + bf8: ff 05 00 00 00 00 incl 0x0 + + /* fill in any missing pages */ + error = 
_pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags); + bfe: 57 push %edi + bff: 8b 43 14 mov 0x14(%ebx),%eax + c02: 8b 40 0c mov 0xc(%eax),%eax + c05: 50 push %eax + c06: 53 push %ebx + c07: e8 4c f8 ff ff call 458 <_pagebuf_lookup_pages> + if (unlikely(error)) { + c0c: 83 c4 0c add $0xc,%esp + c0f: 85 c0 test %eax,%eax + c11: 74 0d je c20 + pagebuf_free(pb); + c13: 53 push %ebx + c14: e8 fc ff ff ff call c15 + return (NULL); + c19: eb 59 jmp c74 + c1b: 90 nop + c1c: 8d 74 26 00 lea 0x0(%esi,1),%esi + } + + /* + * Always fill in the block number now, the mapped cases can do + * their own overlay of this later. + */ + pb->pb_bn = ioff; + c20: 8b 44 24 14 mov 0x14(%esp,1),%eax + c24: 8b 54 24 18 mov 0x18(%esp,1),%edx + c28: 89 43 1c mov %eax,0x1c(%ebx) + pb->pb_count_desired = pb->pb_buffer_length; + c2b: 8b 43 2c mov 0x2c(%ebx),%eax + c2e: 89 53 20 mov %edx,0x20(%ebx) + c31: 89 43 30 mov %eax,0x30(%ebx) + + if (flags & PBF_READ) { + c34: f7 c7 01 00 00 00 test $0x1,%edi + c3a: 74 49 je c85 + if (PBF_NOT_DONE(pb)) { + c3c: 8b 43 08 mov 0x8(%ebx),%eax + c3f: a8 28 test $0x28,%al + c41: 74 12 je c55 + PB_TRACE(pb, PB_TRACE_REC(get_read), flags); + PB_STATS_INC(pbstats.pb_get_read); + c43: ff 05 20 00 00 00 incl 0x20 + pagebuf_iostart(pb, flags); + c49: 57 push %edi + c4a: 53 push %ebx + c4b: e8 fc ff ff ff call c4c + } else if (flags & PBF_ASYNC) { + c50: 83 c4 08 add $0x8,%esp + c53: eb 30 jmp c85 + c55: f7 c7 10 00 00 00 test $0x10,%edi + c5b: 74 23 je c80 + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + if (flags & (PBF_LOCK | PBF_TRYLOCK)) + c5d: f7 c7 00 60 00 00 test $0x6000,%edi + c63: 74 09 je c6e + pagebuf_unlock(pb); + c65: 53 push %ebx + c66: e8 fc ff ff ff call c67 + c6b: 83 c4 04 add $0x4,%esp + pagebuf_rele(pb); + c6e: 53 push %ebx + c6f: e8 fc ff ff ff call c70 + return NULL; + c74: 31 c0 xor %eax,%eax + c76: 83 c4 04 add $0x4,%esp + c79: eb 0c jmp c87 + c7b: 90 nop + c7c: 8d 74 26 00 lea 0x0(%esi,1),%esi + } else 
{ + /* We do not want read in the flags */ + pb->pb_flags &= ~PBF_READ; + c80: 24 fe and $0xfe,%al + c82: 89 43 08 mov %eax,0x8(%ebx) + } + } + + PB_TRACE(pb, PB_TRACE_REC(get_obj), flags); + return (pb); + c85: 89 d8 mov %ebx,%eax +} + c87: 5b pop %ebx + c88: 5e pop %esi + c89: 5f pop %edi + c8a: c3 ret + c8b: 90 nop + +0000000000000c8c : + +/* + * Create a pagebuf and populate it with pages from the address + * space of the passed in inode. + */ +page_buf_t * +pagebuf_lookup( + struct pb_target *target, + struct inode *inode, + loff_t ioff, + size_t isize, + int flags) +{ + page_buf_t *pb = NULL; + int status; + + flags |= _PBF_PRIVATE_BH; + c8c: 56 push %esi + c8d: 53 push %ebx + c8e: 8b 74 24 20 mov 0x20(%esp,1),%esi + c92: 81 ce 00 00 10 00 or $0x100000,%esi + pb = pagebuf_allocate(flags); + c98: f7 c6 00 00 00 20 test $0x20000000,%esi + c9e: 75 18 jne cb8 + ca0: 89 f0 mov %esi,%eax + ca2: 25 00 00 02 00 and $0x20000,%eax + ca7: b9 f0 01 00 00 mov $0x1f0,%ecx + cac: ba f0 00 00 00 mov $0xf0,%edx + cb1: 85 c0 test %eax,%eax + cb3: 0f 45 ca cmovne %edx,%ecx + cb6: eb 02 jmp cba + cb8: 31 c9 xor %ecx,%ecx + cba: a1 24 00 00 00 mov 0x24,%eax + cbf: 51 push %ecx + cc0: 50 push %eax + cc1: e8 fc ff ff ff call cc2 + cc6: 89 c3 mov %eax,%ebx + if (pb) { + cc8: 83 c4 08 add $0x8,%esp + ccb: 85 db test %ebx,%ebx + ccd: 74 51 je d20 + _pagebuf_initialize(pb, target, ioff, isize, flags); + ccf: 56 push %esi + cd0: 8b 44 24 20 mov 0x20(%esp,1),%eax + cd4: 50 push %eax + cd5: 8b 44 24 1c mov 0x1c(%esp,1),%eax + cd9: 8b 54 24 20 mov 0x20(%esp,1),%edx + cdd: 52 push %edx + cde: 50 push %eax + cdf: 8b 44 24 1c mov 0x1c(%esp,1),%eax + ce3: 50 push %eax + ce4: 53 push %ebx + ce5: e8 96 f4 ff ff call 180 <_pagebuf_initialize> + if (flags & PBF_ENTER_PAGES) { + cea: 83 c4 18 add $0x18,%esp + ced: f7 c6 00 00 20 00 test $0x200000,%esi + cf3: 74 2b je d20 + status = _pagebuf_lookup_pages(pb, &inode->i_data, 0); + cf5: 6a 00 push $0x0 + cf7: 8b 44 24 14 mov 0x14(%esp,1),%eax + cfb: 
05 c0 00 00 00 add $0xc0,%eax + d00: 50 push %eax + d01: 53 push %ebx + d02: e8 51 f7 ff ff call 458 <_pagebuf_lookup_pages> + if (status != 0) { + d07: 83 c4 0c add $0xc,%esp + d0a: 85 c0 test %eax,%eax + d0c: 74 12 je d20 + pagebuf_free(pb); + d0e: 53 push %ebx + d0f: e8 fc ff ff ff call d10 + return (NULL); + d14: 31 c0 xor %eax,%eax + d16: 83 c4 04 add $0x4,%esp + d19: eb 07 jmp d22 + d1b: 90 nop + d1c: 8d 74 26 00 lea 0x0(%esi,1),%esi + } + } + } + return pb; + d20: 89 d8 mov %ebx,%eax +} + d22: 5b pop %ebx + d23: 5e pop %esi + d24: c3 ret + d25: 8d 76 00 lea 0x0(%esi),%esi + +0000000000000d28 : + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +pagebuf_readahead( + pb_target_t *target, + loff_t ioff, + size_t isize, + int flags) +{ + d28: 8b 4c 24 04 mov 0x4(%esp,1),%ecx + d2c: 8b 54 24 10 mov 0x10(%esp,1),%edx + flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD); + d30: 8b 44 24 14 mov 0x14(%esp,1),%eax + d34: 0d 11 42 00 20 or $0x20004211,%eax + pagebuf_get(target, ioff, isize, flags); + d39: 50 push %eax + d3a: 52 push %edx + d3b: 8b 44 24 10 mov 0x10(%esp,1),%eax + d3f: 8b 54 24 14 mov 0x14(%esp,1),%edx + d43: 52 push %edx + d44: 50 push %eax + d45: 51 push %ecx + d46: e8 fc ff ff ff call d47 +} + d4b: 83 c4 14 add $0x14,%esp + d4e: c3 ret + d4f: 90 nop + +0000000000000d50 : + +page_buf_t * +pagebuf_get_empty( + pb_target_t *target) +{ + page_buf_t *pb; + + pb = pagebuf_allocate(_PBF_LOCKABLE); + d50: a1 24 00 00 00 mov 0x24,%eax + d55: 53 push %ebx + d56: 68 f0 01 00 00 push $0x1f0 + d5b: 50 push %eax + d5c: e8 fc ff ff ff call d5d + d61: 89 c3 mov %eax,%ebx + if (pb) + d63: 83 c4 08 add $0x8,%esp + d66: 85 db test %ebx,%ebx + d68: 74 19 je d83 + _pagebuf_initialize(pb, target, 0, 0, _PBF_LOCKABLE); + d6a: 68 00 00 08 00 push $0x80000 + d6f: 6a 00 push $0x0 + d71: 6a 00 push $0x0 + d73: 6a 00 push $0x0 + d75: 8b 44 24 18 mov 0x18(%esp,1),%eax + d79: 50 push %eax + d7a: 53 push %ebx + 
d7b: e8 00 f4 ff ff call 180 <_pagebuf_initialize> + d80: 83 c4 18 add $0x18,%esp + return pb; + d83: 89 d8 mov %ebx,%eax + d85: 5b pop %ebx + d86: c3 ret +} + d87: 90 nop + +0000000000000d88 : + +static inline struct page * +mem_to_page( + void *addr) +{ + if (((unsigned long)addr < VMALLOC_START) || + ((unsigned long)addr >= VMALLOC_END)) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +pagebuf_associate_memory( + page_buf_t *pb, + void *mem, + size_t len) +{ + d88: 55 push %ebp + d89: 57 push %edi + d8a: 56 push %esi + d8b: 53 push %ebx + d8c: 8b 7c 24 14 mov 0x14(%esp,1),%edi + int rval; + int i = 0; + size_t ptr; + size_t end, end_cur; + off_t offset; + int page_count; + + page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + d90: 8b 5c 24 1c mov 0x1c(%esp,1),%ebx + offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); + d94: 8b 74 24 18 mov 0x18(%esp,1),%esi + d98: 81 c3 ff 0f 00 00 add $0xfff,%ebx + d9e: c1 eb 0c shr $0xc,%ebx + da1: 89 f5 mov %esi,%ebp + da3: 81 e5 ff 0f 00 00 and $0xfff,%ebp + if (offset && (len > PAGE_CACHE_SIZE)) + da9: 74 0e je db9 + dab: 8d 43 01 lea 0x1(%ebx),%eax + dae: 81 7c 24 1c 01 10 00 cmpl $0x1001,0x1c(%esp,1) + db5: 00 + db6: 0f 43 d8 cmovae %eax,%ebx + page_count++; + + /* Free any previous set of page pointers */ + if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) { + db9: 8b 97 84 00 00 00 mov 0x84(%edi),%edx + dbf: 85 d2 test %edx,%edx + dc1: 74 13 je dd6 + dc3: 8d 87 88 00 00 00 lea 0x88(%edi),%eax + dc9: 39 c2 cmp %eax,%edx + dcb: 74 09 je dd6 + kfree(pb->pb_pages); + dcd: 52 push %edx + dce: e8 fc ff ff ff call dcf + } + dd3: 83 c4 04 add $0x4,%esp + pb->pb_pages = NULL; + dd6: c7 87 84 00 00 00 00 movl $0x0,0x84(%edi) + ddd: 00 00 00 + pb->pb_addr = mem; + de0: 89 77 34 mov %esi,0x34(%edi) + + rval = _pagebuf_get_pages(pb, page_count, 0); + de3: 6a 00 push $0x0 + de5: 53 push %ebx + de6: 57 push %edi + de7: e8 b8 f4 ff ff call 2a4 <_pagebuf_get_pages> + if (rval) + 
dec: 83 c4 0c add $0xc,%esp + def: 85 c0 test %eax,%eax + df1: 0f 85 f6 00 00 00 jne eed + return rval; + + pb->pb_offset = offset; + df7: 66 89 af 80 00 00 00 mov %bp,0x80(%edi) + ptr = (size_t) mem & PAGE_CACHE_MASK; + end = PAGE_CACHE_ALIGN((size_t) mem + len); + dfe: 8b 44 24 1c mov 0x1c(%esp,1),%eax + e02: 8d ac 30 ff 0f 00 00 lea 0xfff(%eax,%esi,1),%ebp + e09: a1 00 00 00 00 mov 0x0,%eax + e0e: 89 f3 mov %esi,%ebx + e10: 81 e3 00 f0 ff ff and $0xfffff000,%ebx + e16: 81 e5 00 f0 ff ff and $0xfffff000,%ebp + e1c: 05 ff ff ff 00 add $0xffffff,%eax + e21: 25 00 00 80 ff and $0xff800000,%eax + e26: 39 c6 cmp %eax,%esi + e28: 72 08 jb e32 + e2a: 81 fe ff df ff fd cmp $0xfdffdfff,%esi + e30: 76 1e jbe e50 + e32: 8d 86 00 00 00 40 lea 0x40000000(%esi),%eax + e38: c1 e8 0c shr $0xc,%eax + e3b: 8d 14 40 lea (%eax,%eax,2),%edx + e3e: c1 e2 04 shl $0x4,%edx + e41: 03 15 00 00 00 00 add 0x0,%edx + e47: eb 12 jmp e5b + e49: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + e50: 56 push %esi + e51: e8 fc ff ff ff call e52 + e56: 89 c2 mov %eax,%edx + e58: 83 c4 04 add $0x4,%esp + end_cur = end; + /* set up first page */ + pb->pb_pages[0] = mem_to_page(mem); + e5b: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + e61: 89 10 mov %edx,(%eax) + + ptr += PAGE_CACHE_SIZE; + e63: 81 c3 00 10 00 00 add $0x1000,%ebx + pb->pb_page_count = ++i; + e69: be 01 00 00 00 mov $0x1,%esi + e6e: 66 c7 47 7e 01 00 movw $0x1,0x7e(%edi) + while (ptr < end) { + e74: 39 eb cmp %ebp,%ebx + e76: 73 5b jae ed3 + e78: a1 00 00 00 00 mov 0x0,%eax + e7d: 05 ff ff ff 00 add $0xffffff,%eax + e82: 25 00 00 80 ff and $0xff800000,%eax + e87: 39 c3 cmp %eax,%ebx + e89: 72 08 jb e93 + e8b: 81 fb ff df ff fd cmp $0xfdffdfff,%ebx + e91: 76 1d jbe eb0 + e93: 8d 83 00 00 00 40 lea 0x40000000(%ebx),%eax + e99: c1 e8 0c shr $0xc,%eax + e9c: 8d 14 40 lea (%eax,%eax,2),%edx + e9f: c1 e2 04 shl $0x4,%edx + ea2: 03 15 00 00 00 00 add 0x0,%edx + ea8: eb 11 jmp ebb + eaa: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + eb0: 53 push %ebx + eb1: e8 
fc ff ff ff call eb2 + eb6: 89 c2 mov %eax,%edx + eb8: 83 c4 04 add $0x4,%esp + pb->pb_pages[i] = mem_to_page((void *)ptr); + ebb: 8b 87 84 00 00 00 mov 0x84(%edi),%eax + ec1: 89 14 b0 mov %edx,(%eax,%esi,4) + pb->pb_page_count = ++i; + ec4: 46 inc %esi + ec5: 66 89 77 7e mov %si,0x7e(%edi) + ptr += PAGE_CACHE_SIZE; + ec9: 81 c3 00 10 00 00 add $0x1000,%ebx + } + ecf: 39 eb cmp %ebp,%ebx + ed1: 72 a5 jb e78 + pb->pb_locked = 0; + ed3: c6 87 82 00 00 00 00 movb $0x0,0x82(%edi) + + pb->pb_count_desired = pb->pb_buffer_length = len; + eda: 8b 44 24 1c mov 0x1c(%esp,1),%eax + ede: 89 47 2c mov %eax,0x2c(%edi) + ee1: 89 47 30 mov %eax,0x30(%edi) + pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH; + ee4: 81 4f 08 04 00 10 00 orl $0x100004,0x8(%edi) + + return 0; + eeb: 31 c0 xor %eax,%eax + eed: 5b pop %ebx + eee: 5e pop %esi + eef: 5f pop %edi + ef0: 5d pop %ebp + ef1: c3 ret +} + ef2: 89 f6 mov %esi,%esi + +0000000000000ef4 : + +page_buf_t * +pagebuf_get_no_daddr( + size_t len, + pb_target_t *target) +{ + ef4: 55 push %ebp + ef5: 57 push %edi + ef6: 56 push %esi + ef7: 53 push %ebx + ef8: 8b 6c 24 14 mov 0x14(%esp,1),%ebp + int rval; + void *rmem = NULL; + efc: 31 f6 xor %esi,%esi + int flags = _PBF_LOCKABLE | PBF_FORCEIO; + page_buf_t *pb; + size_t tlen = 0; + efe: 31 ff xor %edi,%edi + + if (len > 0x20000) + f00: 81 fd 00 00 02 00 cmp $0x20000,%ebp + f06: 77 19 ja f21 + return(NULL); + + pb = pagebuf_allocate(flags); + f08: a1 24 00 00 00 mov 0x24,%eax + f0d: 68 f0 01 00 00 push $0x1f0 + f12: 50 push %eax + f13: e8 fc ff ff ff call f14 + f18: 89 c3 mov %eax,%ebx + if (!pb) + f1a: 83 c4 08 add $0x8,%esp + f1d: 85 db test %ebx,%ebx + f1f: 75 07 jne f28 + return NULL; + f21: 31 c0 xor %eax,%eax + f23: e9 98 00 00 00 jmp fc0 + + _pagebuf_initialize(pb, target, 0, len, flags); + f28: 68 00 00 08 08 push $0x8080000 + f2d: 55 push %ebp + f2e: 6a 00 push $0x0 + f30: 6a 00 push $0x0 + f32: 8b 44 24 28 mov 0x28(%esp,1),%eax + f36: 50 push %eax + f37: 53 push %ebx + f38: e8 43 f2 
ff ff call 180 <_pagebuf_initialize> + + do { + f3d: 83 c4 18 add $0x18,%esp + if (tlen == 0) { + f40: 85 ff test %edi,%edi + f42: 75 04 jne f48 + tlen = len; /* first time */ + f44: 89 ef mov %ebp,%edi + } else { + f46: eb 0b jmp f53 + kfree(rmem); /* free the mem from the previous try */ + f48: 56 push %esi + f49: e8 fc ff ff ff call f4a + tlen <<= 1; /* double the size and try again */ + f4e: 01 ff add %edi,%edi + /* + printk( + "pb_get_no_daddr NOT block 0x%p mask 0x%p len %d\n", + rmem, ((size_t)rmem & (size_t)~SECTOR_MASK), + len); + */ + } + f50: 83 c4 04 add $0x4,%esp + if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) { + f53: 68 f0 01 00 00 push $0x1f0 + f58: 57 push %edi + f59: e8 fc ff ff ff call f5a + f5e: 89 c6 mov %eax,%esi + f60: 83 c4 08 add $0x8,%esp + f63: 85 f6 test %esi,%esi + f65: 74 2b je f92 + pagebuf_free(pb); + return NULL; + } + } while ((size_t)rmem != ((size_t)rmem & (size_t)~SECTOR_MASK)); + f67: 25 00 fe ff ff and $0xfffffe00,%eax + f6c: 39 c6 cmp %eax,%esi + f6e: 75 d0 jne f40 + + if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) { + f70: 55 push %ebp + f71: 56 push %esi + f72: 53 push %ebx + f73: e8 fc ff ff ff call f74 + f78: 83 c4 0c add $0xc,%esp + f7b: 85 c0 test %eax,%eax + f7d: 74 21 je fa0 + kfree(rmem); + f7f: 56 push %esi + f80: e8 fc ff ff ff call f81 + pagebuf_free(pb); + f85: 53 push %ebx + f86: e8 fc ff ff ff call f87 + return NULL; + f8b: 31 c0 xor %eax,%eax + f8d: 83 c4 08 add $0x8,%esp + f90: eb 2e jmp fc0 + f92: 53 push %ebx + f93: e8 fc ff ff ff call f94 + f98: 31 c0 xor %eax,%eax + f9a: 83 c4 04 add $0x4,%esp + f9d: eb 21 jmp fc0 + f9f: 90 nop + } + /* otherwise pagebuf_free just ignores it */ + pb->pb_flags |= _PBF_MEM_ALLOCATED; + fa0: 80 4b 0b 02 orb $0x2,0xb(%ebx) + * The default case (no contention) will result in NO + * jumps for both down() and up(). 
+ */ +static inline void up(struct semaphore * sem) +{ + fa4: 8d 8b 98 00 00 00 lea 0x98(%ebx),%ecx +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + __asm__ __volatile__( + faa: f0 ff 83 98 00 00 00 lock incl 0x98(%ebx) + fb1: 0f 8e 60 1e 00 00 jle 2e17 + up(&PBP(pb)->pb_sema); /* Return unlocked pagebuf */ + + PB_TRACE(pb, PB_TRACE_REC(no_daddr), rmem); + + return pb; + fb7: 89 d8 mov %ebx,%eax +} + fb9: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + fc0: 5b pop %ebx + fc1: 5e pop %esi + fc2: 5f pop %edi + fc3: 5d pop %ebp + fc4: c3 ret + fc5: 8d 76 00 lea 0x0(%esi),%esi + +0000000000000fc8 : + + +/* + * pagebuf_hold + * + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * + * Must hold the buffer already to call this function. + */ +void +pagebuf_hold( + page_buf_t *pb) +{ + fc8: 8b 44 24 04 mov 0x4(%esp,1),%eax + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + fcc: f0 ff 40 18 lock incl 0x18(%eax) + fd0: c3 ret + atomic_inc(&pb->pb_hold); + PB_TRACE(pb, PB_TRACE_REC(hold), 0); +} + fd1: 8d 76 00 lea 0x0(%esi),%esi + +0000000000000fd4 : + +/* + * pagebuf_free + * + * pagebuf_free releases the specified buffer. The modification + * state of any associated pages is left unchanged. 
+ */ +void +pagebuf_free( + page_buf_t *pb) +{ + fd4: 57 push %edi + fd5: 56 push %esi + fd6: 53 push %ebx + fd7: 8b 7c 24 10 mov 0x10(%esp,1),%edi + if (pb->pb_flags & _PBF_LOCKABLE) { + fdb: f6 47 0a 08 testb $0x8,0xa(%edi) + fdf: 74 44 je 1025 + pb_hash_t *h = pb_hash(pb); + fe1: 0f b6 87 83 00 00 00 movzbl 0x83(%edi),%eax + fe8: 8d 04 80 lea (%eax,%eax,4),%eax + feb: 8d 34 85 60 03 00 00 lea 0x360(,%eax,4),%esi + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + ff2: 8d 5e 0c lea 0xc(%esi),%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + ff5: 81 7b 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ebx) + ffc: 74 1a je 1018 +printk("eip: %p\n", &&here); + ffe: 68 f5 0f 00 00 push $0xff5 + 1003: 68 2b 00 00 00 push $0x2b + 1008: e8 fc ff ff ff call 1009 + BUG(); + 100d: 0f 0b ud2a + 100f: 85 00 test %eax,(%eax) + 1011: 00 00 add %al,(%eax) + 1013: 00 00 add %al,(%eax) + } + 1015: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 1018: f0 fe 0b lock decb (%ebx) + 101b: 0f 88 00 1e 00 00 js 2e21 + + spin_lock(&h->pb_hash_lock); + _pagebuf_free_object(h, pb); + 1021: 57 push %edi + 1022: 56 push %esi + } else { + 1023: eb 03 jmp 1028 + _pagebuf_free_object(NULL, pb); + 1025: 57 push %edi + 1026: 6a 00 push $0x0 + 1028: e8 fc ff ff ff call 1029 + } + 102d: 83 c4 08 add $0x8,%esp + 1030: 5b pop %ebx + 1031: 5e pop %esi + 1032: 5f pop %edi + 1033: c3 ret + +0000000000001034 : +} + +/* + * pagebuf_rele + * + * pagebuf_rele releases a hold on the specified buffer. If the + * the hold count is 1, pagebuf_rele calls pagebuf_free. 
+ */ +void +pagebuf_rele( + page_buf_t *pb) +{ + 1034: 57 push %edi + 1035: 56 push %esi + 1036: 53 push %ebx + 1037: 8b 74 24 10 mov 0x10(%esp,1),%esi + pb_hash_t *h; + + PB_TRACE(pb, PB_TRACE_REC(rele), pb->pb_relse); + if (pb->pb_flags & _PBF_LOCKABLE) { + 103b: f6 46 0a 08 testb $0x8,0xa(%esi) + 103f: 74 42 je 1083 + h = pb_hash(pb); + 1041: 0f b6 86 83 00 00 00 movzbl 0x83(%esi),%eax + 1048: 8d 04 80 lea (%eax,%eax,4),%eax + 104b: 8d 1c 85 60 03 00 00 lea 0x360(,%eax,4),%ebx + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 1052: 8d 7b 0c lea 0xc(%ebx),%edi +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 1055: 81 7f 04 ad 4e ad de cmpl $0xdead4ead,0x4(%edi) + 105c: 74 1a je 1078 +printk("eip: %p\n", &&here); + 105e: 68 55 10 00 00 push $0x1055 + 1063: 68 2b 00 00 00 push $0x2b + 1068: e8 fc ff ff ff call 1069 + BUG(); + 106d: 0f 0b ud2a + 106f: 85 00 test %eax,(%eax) + 1071: 00 00 add %al,(%eax) + 1073: 00 00 add %al,(%eax) + } + 1075: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 1078: f0 fe 0f lock decb (%edi) + 107b: 0f 88 ac 1d 00 00 js 2e2d + spin_lock(&h->pb_hash_lock); + } else { + 1081: eb 02 jmp 1085 + h = NULL; + 1083: 31 db xor %ebx,%ebx +static __inline__ int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + __asm__ __volatile__( + 1085: f0 ff 4e 18 lock decl 0x18(%esi) + 1089: 0f 94 c0 sete %al + } + + if (atomic_dec_and_test(&pb->pb_hold)) { + 108c: 84 c0 test %al,%al + 108e: 0f 84 cd 00 00 00 je 1161 + int do_free = 1; + 1094: ba 01 00 00 00 mov $0x1,%edx + + if (pb->pb_relse) { + 1099: 83 7e 50 00 cmpl $0x0,0x50(%esi) + 109d: 74 36 je 10d5 + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + 109f: f0 ff 46 18 lock incl 0x18(%esi) + atomic_inc(&pb->pb_hold); + if (h) + 10a3: 85 db test %ebx,%ebx + 10a5: 74 23 je 10ca +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 10a7: 81 7b 10 ad 4e ad de cmpl $0xdead4ead,0x10(%ebx) + 10ae: 74 08 je 10b8 + BUG(); + 10b0: 0f 0b ud2a + 10b2: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 10b8: 8a 43 0c mov 0xc(%ebx),%al + 10bb: 84 c0 test %al,%al + 10bd: 7e 08 jle 10c7 + BUG(); + 10bf: 0f 0b ud2a + 10c1: 6b 00 00 imul $0x0,(%eax),%eax + 10c4: 00 00 add %al,(%eax) + 10c6: 00 86 53 0c 56 8b add %al,0x8b560c53(%esi) + spin_unlock(&h->pb_hash_lock); + (*(pb->pb_relse)) (pb); + 10cc: 46 inc %esi + 10cd: 50 push %eax + 10ce: ff d0 call *%eax + do_free = 0; + 10d0: 31 d2 xor %edx,%edx + } + 10d2: 83 c4 04 add $0x4,%esp + if (pb->pb_flags & PBF_DELWRI) { + 10d5: 8b 46 08 mov 0x8(%esi),%eax + 10d8: a8 40 test $0x40,%al + 10da: 74 45 je 1121 + pb->pb_flags |= PBF_ASYNC; + 10dc: 0c 10 or $0x10,%al + 10de: 89 46 08 mov %eax,0x8(%esi) + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + 10e1: f0 ff 46 18 lock incl 0x18(%esi) + atomic_inc(&pb->pb_hold); + if (h && do_free) + 10e5: 85 db test %ebx,%ebx + 10e7: 74 29 je 1112 + 10e9: 85 d2 test %edx,%edx + 10eb: 74 25 je 1112 + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 10ed: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 10ef: 81 7b 10 ad 4e ad de cmpl $0xdead4ead,0x10(%ebx) + 10f6: 74 08 je 1100 + BUG(); + 10f8: 0f 0b ud2a + 10fa: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 1100: 8a 43 0c mov 0xc(%ebx),%al + 1103: 84 c0 test %al,%al + 1105: 7e 08 jle 110f + BUG(); + 1107: 0f 0b ud2a + 1109: 6b 00 00 imul $0x0,(%eax),%eax + 110c: 00 00 add %al,(%eax) + 110e: 00 86 53 0c 6a 00 add %al,0x6a0c53(%esi) + spin_unlock(&h->pb_hash_lock); + pagebuf_delwri_queue(pb, 0); + 1114: 56 push %esi + 1115: e8 fc ff ff ff call 1116 + do_free = 0; + 111a: 31 d2 xor %edx,%edx + } else if (pb->pb_flags & PBF_FS_MANAGED) { + 111c: 83 c4 08 add $0x8,%esp + 111f: eb 30 jmp 1151 + 1121: f6 c4 08 test $0x8,%ah + 1124: 74 2b je 1151 + if (h) + 1126: 85 db test %ebx,%ebx + 1128: 74 25 je 114f + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 112a: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 112c: 81 7b 10 ad 4e ad de cmpl $0xdead4ead,0x10(%ebx) + 1133: 74 08 je 113d + BUG(); + 1135: 0f 0b ud2a + 1137: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 113d: 8a 43 0c mov 0xc(%ebx),%al + 1140: 84 c0 test %al,%al + 1142: 7e 08 jle 114c + BUG(); + 1144: 0f 0b ud2a + 1146: 6b 00 00 imul $0x0,(%eax),%eax + 1149: 00 00 add %al,(%eax) + 114b: 00 86 53 0c 31 d2 add %al,0xd2310c53(%esi) + spin_unlock(&h->pb_hash_lock); + do_free = 0; + } + + if (do_free) { + 1151: 85 d2 test %edx,%edx + 1153: 74 35 je 118a + _pagebuf_free_object(h, pb); + 
1155: 56 push %esi + 1156: 53 push %ebx + 1157: e8 fc ff ff ff call 1158 + } + 115c: 83 c4 08 add $0x8,%esp + } else if (h) { + 115f: eb 29 jmp 118a + 1161: 85 db test %ebx,%ebx + 1163: 74 25 je 118a + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 1165: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 1167: 81 7b 10 ad 4e ad de cmpl $0xdead4ead,0x10(%ebx) + 116e: 74 08 je 1178 + BUG(); + 1170: 0f 0b ud2a + 1172: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 1178: 8a 43 0c mov 0xc(%ebx),%al + 117b: 84 c0 test %al,%al + 117d: 7e 08 jle 1187 + BUG(); + 117f: 0f 0b ud2a + 1181: 6b 00 00 imul $0x0,(%eax),%eax + 1184: 00 00 add %al,(%eax) + 1186: 00 86 53 0c 5b 5e add %al,0x5e5b0c53(%esi) + spin_unlock(&h->pb_hash_lock); + } + 118c: 5f pop %edi + 118d: c3 ret +} + 118e: 89 f6 mov %esi,%esi + +0000000000001190 : + + +/* + * Pinning Buffer Storage in Memory + */ + +/* + * pagebuf_pin + * + * pagebuf_pin locks all of the memory represented by a buffer in + * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for + * the same or different buffers affecting a given page, will + * properly count the number of outstanding "pin" requests. The + * buffer may be released after the pagebuf_pin and a different + * buffer used when calling pagebuf_unpin, if desired. + * pagebuf_pin should be used by the file system when it wants be + * assured that no attempt will be made to force the affected + * memory to disk. It does not assure that a given logical page + * will not be moved to a different physical page. + */ +void +pagebuf_pin( + page_buf_t *pb) +{ + 1190: 8b 44 24 04 mov 0x4(%esp,1),%eax + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + 1194: f0 ff 80 b8 00 00 00 lock incl 0xb8(%eax) + 119b: c3 ret + +000000000000119c : + atomic_inc(&PBP(pb)->pb_pin_count); + PB_TRACE(pb, PB_TRACE_REC(pin), PBP(pb)->pb_pin_count.counter); +} + +/* + * pagebuf_unpin + * + * pagebuf_unpin reverses the locking of memory performed by + * pagebuf_pin. Note that both functions affected the logical + * pages associated with the buffer, not the buffer itself. + */ +void +pagebuf_unpin( + page_buf_t *pb) +{ + 119c: 8b 54 24 04 mov 0x4(%esp,1),%edx +static __inline__ int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + __asm__ __volatile__( + 11a0: f0 ff 8a b8 00 00 00 lock decl 0xb8(%edx) + 11a7: 0f 94 c0 sete %al + if (atomic_dec_and_test(&PBP(pb)->pb_pin_count)) { + 11aa: 84 c0 test %al,%al + 11ac: 74 12 je 11c0 + wake_up_all(&PBP(pb)->pb_waiters); + 11ae: 8d 82 bc 00 00 00 lea 0xbc(%edx),%eax + 11b4: 31 c9 xor %ecx,%ecx + 11b6: ba 03 00 00 00 mov $0x3,%edx + 11bb: e8 fc ff ff ff call 11bc + } + 11c0: c3 ret + PB_TRACE(pb, PB_TRACE_REC(unpin), PBP(pb)->pb_pin_count.counter); +} + 11c1: 8d 76 00 lea 0x0(%esi),%esi + +00000000000011c4 : + +int +pagebuf_ispin( + page_buf_t *pb) +{ + 11c4: 8b 44 24 04 mov 0x4(%esp,1),%eax + return atomic_read(&PBP(pb)->pb_pin_count); + 11c8: 8b 80 b8 00 00 00 mov 0xb8(%eax),%eax + 11ce: c3 ret + 11cf: 90 nop + +00000000000011d0 : +} + +/* + * pagebuf_wait_unpin + * + * pagebuf_wait_unpin waits until all of the memory associated + * with the buffer is not longer locked in memory. It returns + * immediately if none of the affected pages are locked. 
+ */ +static inline void +_pagebuf_wait_unpin( + page_buf_t *pb) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&PBP(pb)->pb_pin_count) == 0) + return; + + add_wait_queue(&PBP(pb)->pb_waiters, &wait); + for (;;) { + current->state = TASK_UNINTERRUPTIBLE; + if (atomic_read(&PBP(pb)->pb_pin_count) == 0) { + break; + } + run_task_queue(&tq_disk); + schedule(); + } + remove_wait_queue(&PBP(pb)->pb_waiters, &wait); + current->state = TASK_RUNNING; +} + +void +pagebuf_queue_task( + struct tq_struct *task) +{ + 11d0: 55 push %ebp + 11d1: 57 push %edi + 11d2: 56 push %esi + 11d3: 53 push %ebx + 11d4: 8b 6c 24 14 mov 0x14(%esp,1),%ebp + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 11d8: b8 00 e0 ff ff mov $0xffffe000,%eax + 11dd: 21 e0 and %esp,%eax + * Queue a task on a tq. Return non-zero if it was successfully + * added. + */ +static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list) +{ + 11df: 8b 58 30 mov 0x30(%eax),%ebx + 11e2: c1 e3 03 shl $0x3,%ebx + 11e5: 81 c3 40 00 00 00 add $0x40,%ebx + int ret = 0; + 11eb: 31 c0 xor %eax,%eax +static __inline__ int test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + 11ed: f0 0f ab 45 08 lock bts %eax,0x8(%ebp) + 11f2: 19 c0 sbb %eax,%eax + */ +static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list) +{ + int ret = 0; + if (!test_and_set_bit(0,&bh_pointer->sync)) { + 11f4: 85 c0 test %eax,%eax + 11f6: 75 74 jne 126c + unsigned long flags; + spin_lock_irqsave(&tqueue_lock, flags); + 11f8: 9c pushf + 11f9: 5f pop %edi + 11fa: fa cli + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 11fb: be 00 00 00 00 mov $0x0,%esi +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 1200: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 1207: 4e ad de + 120a: 74 1a je 1226 
+printk("eip: %p\n", &&here); + 120c: 68 00 12 00 00 push $0x1200 + 1211: 68 2b 00 00 00 push $0x2b + 1216: e8 fc ff ff ff call 1217 + BUG(); + 121b: 0f 0b ud2a + 121d: 85 00 test %eax,(%eax) + 121f: 00 00 add %al,(%eax) + 1221: 00 00 add %al,(%eax) + } + 1223: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 1226: f0 fe 0e lock decb (%esi) + 1229: 0f 88 0a 1c 00 00 js 2e39 + */ +static __inline__ void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + 122f: 8b 43 04 mov 0x4(%ebx),%eax + next->prev = new; + 1232: 89 6b 04 mov %ebp,0x4(%ebx) + new->next = next; + 1235: 89 5d 00 mov %ebx,0x0(%ebp) + new->prev = prev; + 1238: 89 45 04 mov %eax,0x4(%ebp) + prev->next = new; + 123b: 89 28 mov %ebp,(%eax) + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 123d: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 123f: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 1246: 4e ad de + 1249: 74 08 je 1253 + BUG(); + 124b: 0f 0b ud2a + 124d: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 1253: a0 00 00 00 00 mov 0x0,%al + 1258: 84 c0 test %al,%al + 125a: 7e 08 jle 1264 + BUG(); + 125c: 0f 0b ud2a + 125e: 6b 00 00 imul $0x0,(%eax),%eax + 1261: 00 00 add %al,(%eax) + 1263: 00 86 15 00 00 00 add %al,0x15(%esi) +#endif + __asm__ __volatile__( + 1269: 00 57 9d add %dl,0xffffff9d(%edi) + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 126c: b8 00 e0 ff ff mov $0xffffe000,%eax + 1271: 21 e0 and %esp,%eax + queue_task(task, &pagebuf_iodone_tq[smp_processor_id()]); + wake_up(&pagebuf_iodone_wait[smp_processor_id()]); + 1273: 8b 40 30 mov 0x30(%eax),%eax + 1276: c1 e0 04 shl $0x4,%eax + 1279: 05 40 01 00 00 add $0x140,%eax + 127e: b9 01 00 00 00 mov $0x1,%ecx + 1283: ba 03 00 00 00 mov $0x3,%edx + 1288: e8 fc ff ff ff call 1289 + 128d: 5b 
pop %ebx + 128e: 5e pop %esi + 128f: 5f pop %edi + 1290: 5d pop %ebp + 1291: c3 ret +} + 1292: 89 f6 mov %esi,%esi + +0000000000001294 : + + +/* + * Buffer Utility Routines + */ + +/* + * pagebuf_iodone + * + * pagebuf_iodone marks a buffer for which I/O is in progress + * done with respect to that I/O. The pb_done routine, if + * present, will be called as a side-effect. + */ +void +pagebuf_iodone_sched( + void *v) +{ + 1294: 53 push %ebx + 1295: 8b 5c 24 08 mov 0x8(%esp,1),%ebx + page_buf_t *pb = (page_buf_t *)v; + + if (pb->pb_iodone) { + 1299: 8b 43 4c mov 0x4c(%ebx),%eax + 129c: 85 c0 test %eax,%eax + 129e: 74 05 je 12a5 + (*(pb->pb_iodone)) (pb); + 12a0: 53 push %ebx + 12a1: ff d0 call *%eax + return; + 12a3: eb 23 jmp 12c8 + } + + if (pb->pb_flags & PBF_ASYNC) { + 12a5: 8b 43 08 mov 0x8(%ebx),%eax + 12a8: a8 10 test $0x10,%al + 12aa: 74 1f je 12cb + if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse) + 12ac: a9 00 00 08 00 test $0x80000,%eax + 12b1: 74 0f je 12c2 + 12b3: 83 7b 50 00 cmpl $0x0,0x50(%ebx) + 12b7: 75 09 jne 12c2 + pagebuf_unlock(pb); + 12b9: 53 push %ebx + 12ba: e8 fc ff ff ff call 12bb + 12bf: 83 c4 04 add $0x4,%esp + pagebuf_rele(pb); + 12c2: 53 push %ebx + 12c3: e8 fc ff ff ff call 12c4 + } + 12c8: 83 c4 04 add $0x4,%esp +} + 12cb: 5b pop %ebx + 12cc: c3 ret + 12cd: 8d 76 00 lea 0x0(%esi),%esi + +00000000000012d0 : + +void +pagebuf_iodone( + page_buf_t *pb) +{ + 12d0: 55 push %ebp + 12d1: 57 push %edi + 12d2: 56 push %esi + 12d3: 53 push %ebx + 12d4: 8b 7c 24 14 mov 0x14(%esp,1),%edi + pb->pb_flags &= ~(PBF_READ | PBF_WRITE); + 12d8: 8b 47 08 mov 0x8(%edi),%eax + 12db: 89 c2 mov %eax,%edx + 12dd: 83 e2 fc and $0xfffffffc,%edx + 12e0: 89 57 08 mov %edx,0x8(%edi) + if (pb->pb_error == 0) { + 12e3: 66 83 7f 7c 00 cmpw $0x0,0x7c(%edi) + 12e8: 75 05 jne 12ef + pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE); + 12ea: 24 d4 and $0xd4,%al + 12ec: 89 47 08 mov %eax,0x8(%edi) + } + + PB_TRACE(pb, PB_TRACE_REC(done), pb->pb_iodone); + + if ((pb->pb_iodone) 
|| (pb->pb_flags & PBF_ASYNC)) { + 12ef: 83 7f 4c 00 cmpl $0x0,0x4c(%edi) + 12f3: 75 0b jne 1300 + 12f5: f6 47 08 10 testb $0x10,0x8(%edi) + 12f9: 0f 84 d3 00 00 00 je 13d2 + 12ff: 90 nop + INIT_TQUEUE(&pb->pb_iodone_sched, + 1300: 8d 47 38 lea 0x38(%edi),%eax + 1303: 89 47 38 mov %eax,0x38(%edi) + 1306: 89 47 3c mov %eax,0x3c(%edi) + 1309: c7 47 40 00 00 00 00 movl $0x0,0x40(%edi) + 1310: c7 47 44 00 00 00 00 movl $0x0,0x44(%edi) + pagebuf_iodone_sched, (void *)pb); + 1317: 89 7f 48 mov %edi,0x48(%edi) + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 131a: ba 00 e0 ff ff mov $0xffffe000,%edx + 131f: 21 e2 and %esp,%edx + * Queue a task on a tq. Return non-zero if it was successfully + * added. + */ +static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list) +{ + 1321: 8b 5a 30 mov 0x30(%edx),%ebx + 1324: 89 c6 mov %eax,%esi + 1326: c1 e3 03 shl $0x3,%ebx + 1329: 81 c3 40 00 00 00 add $0x40,%ebx + int ret = 0; + 132f: 31 c0 xor %eax,%eax +static __inline__ int test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + 1331: f0 0f ab 47 40 lock bts %eax,0x40(%edi) + 1336: 19 c0 sbb %eax,%eax + */ +static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list) +{ + int ret = 0; + if (!test_and_set_bit(0,&bh_pointer->sync)) { + 1338: 85 c0 test %eax,%eax + 133a: 75 73 jne 13af + unsigned long flags; + spin_lock_irqsave(&tqueue_lock, flags); + 133c: 9c pushf + 133d: 5d pop %ebp + 133e: fa cli + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 133f: bf 00 00 00 00 mov $0x0,%edi +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 1344: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 134b: 4e ad de + 134e: 74 1a je 136a +printk("eip: %p\n", &&here); + 1350: 68 44 13 00 00 push $0x1344 + 1355: 68 2b 00 00 00 push $0x2b + 135a: e8 fc 
ff ff ff call 135b + BUG(); + 135f: 0f 0b ud2a + 1361: 85 00 test %eax,(%eax) + 1363: 00 00 add %al,(%eax) + 1365: 00 00 add %al,(%eax) + } + 1367: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 136a: f0 fe 0f lock decb (%edi) + 136d: 0f 88 d2 1a 00 00 js 2e45 + */ +static __inline__ void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + 1373: 8b 43 04 mov 0x4(%ebx),%eax + next->prev = new; + 1376: 89 73 04 mov %esi,0x4(%ebx) + new->next = next; + 1379: 89 1e mov %ebx,(%esi) + new->prev = prev; + 137b: 89 46 04 mov %eax,0x4(%esi) + prev->next = new; + 137e: 89 30 mov %esi,(%eax) + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 1380: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 1382: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 1389: 4e ad de + 138c: 74 08 je 1396 + BUG(); + 138e: 0f 0b ud2a + 1390: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 1396: a0 00 00 00 00 mov 0x0,%al + 139b: 84 c0 test %al,%al + 139d: 7e 08 jle 13a7 + BUG(); + 139f: 0f 0b ud2a + 13a1: 6b 00 00 imul $0x0,(%eax),%eax + 13a4: 00 00 add %al,(%eax) + 13a6: 00 86 15 00 00 00 add %al,0x15(%esi) +#endif + __asm__ __volatile__( + 13ac: 00 55 9d add %dl,0xffffff9d(%ebp) + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 13af: b8 00 e0 ff ff mov $0xffffe000,%eax + 13b4: 21 e0 and %esp,%eax + + queue_task(&pb->pb_iodone_sched, + &pagebuf_iodone_tq[smp_processor_id()]); + wake_up(&pagebuf_iodone_wait[smp_processor_id()]); + 13b6: 8b 40 30 mov 0x30(%eax),%eax + 13b9: c1 e0 04 shl $0x4,%eax + 13bc: 05 40 01 00 00 add $0x140,%eax + 13c1: b9 01 00 00 00 mov $0x1,%ecx + 13c6: ba 03 00 00 00 mov $0x3,%edx + 13cb: e8 fc ff ff ff call 13cc + } else { + 13d0: eb 0f jmp 13e1 + * The default case (no contention) will result in NO + * jumps for both 
down() and up(). + */ +static inline void up(struct semaphore * sem) +{ + 13d2: 8d 47 58 lea 0x58(%edi),%eax +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + __asm__ __volatile__( + 13d5: 89 c1 mov %eax,%ecx + 13d7: f0 ff 47 58 lock incl 0x58(%edi) + 13db: 0f 8e 70 1a 00 00 jle 2e51 + up(&pb->pb_iodonesema); + } + 13e1: 5b pop %ebx + 13e2: 5e pop %esi + 13e3: 5f pop %edi + 13e4: 5d pop %ebp + 13e5: c3 ret +} + 13e6: 89 f6 mov %esi,%esi + +00000000000013e8 : + +/* + * pagebuf_ioerror + * + * pagebuf_ioerror sets the error code for a buffer. + */ +void +pagebuf_ioerror( /* mark/clear buffer error flag */ + page_buf_t *pb, /* buffer to mark */ + unsigned int error) /* error to store (0 if none) */ +{ + 13e8: 8b 54 24 04 mov 0x4(%esp,1),%edx + 13ec: 8b 44 24 08 mov 0x8(%esp,1),%eax + pb->pb_error = error; + 13f0: 66 89 42 7c mov %ax,0x7c(%edx) + 13f4: c3 ret + PB_TRACE(pb, PB_TRACE_REC(ioerror), error); + 13f5: 8d 76 00 lea 0x0(%esi),%esi + +00000000000013f8 : +} + +/* + * pagebuf_iostart + * + * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied. + * If necessary, it will arrange for any disk space allocation required, + * and it will break up the request if the block mappings require it. + * An pb_iodone routine in the buffer supplied will only be called + * when all of the subsidiary I/O requests, if any, have been completed. + * pagebuf_iostart calls the pagebuf_ioinitiate routine or + * pagebuf_iorequest, if the former routine is not defined, to start + * the I/O on a given low-level request. 
+ */ +int +pagebuf_iostart( /* start I/O on a buffer */ + page_buf_t *pb, /* buffer to start */ + page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ + /* PBF_WRITE, PBF_ALLOCATE, */ + /* PBF_DELWRI, */ + /* PBF_SYNC, PBF_DONT_BLOCK */ + /* PBF_RELEASE */ +{ + 13f8: 56 push %esi + 13f9: 53 push %ebx + 13fa: 8b 74 24 0c mov 0xc(%esp,1),%esi + 13fe: 8b 5c 24 10 mov 0x10(%esp,1),%ebx + int status = 0; + + PB_TRACE(pb, PB_TRACE_REC(iostart), flags); + + if (flags & PBF_DELWRI) { + 1402: f6 c3 40 test $0x40,%bl + 1405: 74 1f je 1426 + pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); + 1407: 8b 46 08 mov 0x8(%esi),%eax + 140a: 24 ec and $0xec,%al + pb->pb_flags |= flags & + 140c: 81 e3 50 01 00 00 and $0x150,%ebx + 1412: 09 d8 or %ebx,%eax + 1414: 89 46 08 mov %eax,0x8(%esi) + (PBF_DELWRI | PBF_ASYNC | PBF_SYNC); + pagebuf_delwri_queue(pb, 1); + 1417: 6a 01 push $0x1 + 1419: 56 push %esi + 141a: e8 fc ff ff ff call 141b + return status; + 141f: 31 c0 xor %eax,%eax + 1421: 83 c4 08 add $0x8,%esp + 1424: eb 55 jmp 147b + } + + pb->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI|PBF_READ_AHEAD); + 1426: 8b 56 08 mov 0x8(%esi),%edx + 1429: 81 e2 ac ff ff df and $0xdfffffac,%edx + pb->pb_flags |= flags & (PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_READ_AHEAD); + 142f: 89 d8 mov %ebx,%eax + 1431: 25 13 01 00 20 and $0x20000113,%eax + 1436: 09 c2 or %eax,%edx + 1438: 89 56 08 mov %edx,0x8(%esi) + + if (pb->pb_bn == PAGE_BUF_DADDR_NULL) { + 143b: 83 7e 1c ff cmpl $0xffffffff,0x1c(%esi) + 143f: 75 0e jne 144f + 1441: 83 7e 20 ff cmpl $0xffffffff,0x20(%esi) + 1445: 75 08 jne 144f + BUG(); + 1447: 0f 0b ud2a + 1449: 63 05 34 00 00 00 arpl %ax,0x34 + } + + /* For writes call internal function which checks for + * filesystem specific callout function and execute it. 
+ */ + if (flags & PBF_WRITE) { + 144f: f6 c3 02 test $0x2,%bl + 1452: 74 0c je 1460 +extern void pagebuf_terminate(void); + +static __inline__ int __pagebuf_iorequest(page_buf_t *pb) +{ + if (pb->pb_strat) + 1454: 8b 46 54 mov 0x54(%esi),%eax + 1457: 85 c0 test %eax,%eax + 1459: 74 05 je 1460 + return pb->pb_strat(pb); + 145b: 56 push %esi + 145c: ff d0 call *%eax + 145e: eb 06 jmp 1466 + status = __pagebuf_iorequest(pb); + } else { + status = pagebuf_iorequest(pb); + 1460: 56 push %esi + 1461: e8 fc ff ff ff call 1462 + } + 1466: 83 c4 04 add $0x4,%esp + + /* Wait for I/O if we are not an async request */ + if ((status == 0) && (flags & PBF_ASYNC) == 0) { + 1469: 85 c0 test %eax,%eax + 146b: 75 0e jne 147b + 146d: f6 c3 10 test $0x10,%bl + 1470: 75 09 jne 147b + status = pagebuf_iowait(pb); + 1472: 56 push %esi + 1473: e8 fc ff ff ff call 1474 + } + 1478: 83 c4 04 add $0x4,%esp + + return status; +} + 147b: 5b pop %ebx + 147c: 5e pop %esi + 147d: c3 ret + 147e: 89 f6 mov %esi,%esi + +0000000000001480 <_end_pagebuf_page_io>: + + +/* + * Helper routines for pagebuf_iorequest (pagebuf I/O completion) + * + * (different routines for locked/unlocked, and single/multi-bh pagebufs) + */ + +STATIC inline void +_pb_io_done( + page_buf_t *pb) +{ + if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) { + pb->pb_locked = 0; + pagebuf_iodone(pb); + } +} + +STATIC void +_end_pagebuf_page_io( + struct buffer_head *bh, + int uptodate, + int locked) +{ + 1480: 83 ec 04 sub $0x4,%esp + 1483: 55 push %ebp + 1484: 57 push %edi + 1485: 56 push %esi + 1486: 53 push %ebx + 1487: 8b 5c 24 18 mov 0x18(%esp,1),%ebx + 148b: 8b 44 24 1c mov 0x1c(%esp,1),%eax + struct page *page; + page_buf_t *pb = (page_buf_t *) bh->b_private; + 148f: 8b 6b 40 mov 0x40(%ebx),%ebp + * This is called by bh->b_end_io() handlers when I/O has completed. 
+ */ +static inline void mark_buffer_uptodate(struct buffer_head * bh, int on) +{ + if (on) + 1492: 85 c0 test %eax,%eax + 1494: 74 0a je 14a0 <_end_pagebuf_page_io+0x20> + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + 1496: 31 c0 xor %eax,%eax + __asm__ __volatile__( LOCK_PREFIX + 1498: f0 0f ab 43 18 lock bts %eax,0x18(%ebx) + */ +static inline void mark_buffer_uptodate(struct buffer_head * bh, int on) +{ + if (on) + set_bit(BH_Uptodate, &bh->b_state); + 149d: eb 06 jmp 14a5 <_end_pagebuf_page_io+0x25> + 149f: 90 nop + * in order to ensure changes are visible on other processors. + */ +static __inline__ void clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + 14a0: f0 0f b3 43 18 lock btr %eax,0x18(%ebx) + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_dec(atomic_t *v) +{ + __asm__ __volatile__( + 14a5: f0 ff 4b 10 lock decl 0x10(%ebx) + + mark_buffer_uptodate(bh, uptodate); + atomic_dec(&bh->b_count); + + page = bh->b_page; + 14a9: 8b 43 38 mov 0x38(%ebx),%eax + 14ac: 89 44 24 10 mov %eax,0x10(%esp,1) +#endif + +static __inline__ int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; + 14b0: 8b 43 18 mov 0x18(%ebx),%eax + if (!test_bit(BH_Uptodate, &bh->b_state)) { + 14b3: a8 01 test $0x1,%al + 14b5: 75 14 jne 14cb <_end_pagebuf_page_io+0x4b> + * restricted to acting on a single-word quantity. 
+ */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + 14b7: 8b 54 24 10 mov 0x10(%esp,1),%edx + 14bb: b8 01 00 00 00 mov $0x1,%eax + 14c0: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + set_bit(PG_error, &page->flags); + pb->pb_error = EIO; + 14c5: 66 c7 45 7c 05 00 movw $0x5,0x7c(%ebp) + } + + unlock_buffer(bh); + 14cb: 89 d8 mov %ebx,%eax + 14cd: e8 fc ff ff ff call 14ce <_end_pagebuf_page_io+0x4e> + 14d2: 83 3d 00 00 00 00 40 cmpl $0x40,0x0 + 14d9: 75 15 jne 14f0 <_end_pagebuf_page_io+0x70> + 14db: a1 00 00 00 00 mov 0x0,%eax + 14e0: 53 push %ebx + 14e1: 50 push %eax + 14e2: e8 fc ff ff ff call 14e3 <_end_pagebuf_page_io+0x63> + 14e7: 83 c4 08 add $0x8,%esp + 14ea: e9 a1 00 00 00 jmp 1590 <_end_pagebuf_page_io+0x110> + 14ef: 90 nop + 14f0: 9c pushf + 14f1: 5f pop %edi + 14f2: fa cli + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 14f3: be 00 00 00 00 mov $0x0,%esi +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 14f8: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 14ff: 4e ad de + 1502: 74 1a je 151e <_end_pagebuf_page_io+0x9e> +printk("eip: %p\n", &&here); + 1504: 68 f8 14 00 00 push $0x14f8 + 1509: 68 2b 00 00 00 push $0x2b + 150e: e8 fc ff ff ff call 150f <_end_pagebuf_page_io+0x8f> + BUG(); + 1513: 0f 0b ud2a + 1515: 85 00 test %eax,(%eax) + 1517: 00 00 add %al,(%eax) + 1519: 00 00 add %al,(%eax) + } + 151b: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 151e: f0 fe 0e lock decb (%esi) + 1521: 0f 88 34 19 00 00 js 2e5b + 1527: c7 43 30 00 00 00 00 movl $0x0,0x30(%ebx) + 152e: a1 00 00 00 00 mov 0x0,%eax + 1533: 89 03 mov %eax,(%ebx) + 1535: 89 1d 00 00 00 00 mov %ebx,0x0 + 153b: ff 05 00 00 00 00 incl 0x0 + 1541: 81 3d 48 03 00 00 48 cmpl $0x348,0x348 + 1548: 03 00 00 + 154b: 74 14 je 1561 <_end_pagebuf_page_io+0xe1> + 154d: b9 01 00 00 00 mov $0x1,%ecx + 1552: ba 03 00 00 00 mov $0x3,%edx + 1557: b8 40 03 00 00 mov $0x340,%eax + 155c: 
e8 fc ff ff ff call 155d <_end_pagebuf_page_io+0xdd> + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 1561: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 1563: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 156a: 4e ad de + 156d: 74 08 je 1577 <_end_pagebuf_page_io+0xf7> + BUG(); + 156f: 0f 0b ud2a + 1571: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 1577: a0 00 00 00 00 mov 0x0,%al + 157c: 84 c0 test %al,%al + 157e: 7e 08 jle 1588 <_end_pagebuf_page_io+0x108> + BUG(); + 1580: 0f 0b ud2a + 1582: 6b 00 00 imul $0x0,(%eax),%eax + 1585: 00 00 add %al,(%eax) + 1587: 00 86 15 00 00 00 add %al,0x15(%esi) +#endif + __asm__ __volatile__( + 158d: 00 57 9d add %dl,0xffffff9d(%edi) + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + 1590: 8b 54 24 10 mov 0x10(%esp,1),%edx + 1594: b8 03 00 00 00 mov $0x3,%eax + 1599: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + _pagebuf_free_bh(bh); + + SetPageUptodate(page); + if (locked) + 159e: 83 7c 24 20 00 cmpl $0x0,0x20(%esp,1) + 15a3: 74 09 je 15ae <_end_pagebuf_page_io+0x12e> + unlock_page(page); + 15a5: 8b 44 24 10 mov 0x10(%esp,1),%eax + 15a9: e8 fc ff ff ff call 15aa <_end_pagebuf_page_io+0x12a> +static __inline__ int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + __asm__ __volatile__( + 15ae: f0 ff 8d b4 00 00 00 lock decl 0xb4(%ebp) + 15b5: 0f 94 c0 sete %al + 15b8: 84 c0 test %al,%al + 15ba: 74 10 je 15cc <_end_pagebuf_page_io+0x14c> + 15bc: c6 85 82 00 00 00 00 movb $0x0,0x82(%ebp) + 15c3: 55 push %ebp + 15c4: e8 fc ff ff ff call 15c5 <_end_pagebuf_page_io+0x145> + 15c9: 83 c4 04 add $0x4,%esp + 15cc: 5b pop %ebx + 15cd: 5e pop %esi + 15ce: 5f pop %edi + 15cf: 5d pop %ebp + 15d0: 59 pop %ecx + 15d1: c3 ret + _pb_io_done(pb); +} + 15d2: 89 f6 mov %esi,%esi + +00000000000015d4 <_end_io_locked>: + +STATIC 
void +_end_io_locked( + struct buffer_head *bh, + int uptodate) +{ + 15d4: 8b 54 24 04 mov 0x4(%esp,1),%edx + 15d8: 8b 44 24 08 mov 0x8(%esp,1),%eax + _end_pagebuf_page_io(bh, uptodate, 1); + 15dc: 6a 01 push $0x1 + 15de: 50 push %eax + 15df: 52 push %edx + 15e0: e8 9b fe ff ff call 1480 <_end_pagebuf_page_io> +} + 15e5: 83 c4 0c add $0xc,%esp + 15e8: c3 ret + 15e9: 8d 76 00 lea 0x0(%esi),%esi + +00000000000015ec <_end_io_nolock>: + +STATIC void +_end_io_nolock( + struct buffer_head *bh, + int uptodate) +{ + 15ec: 8b 54 24 04 mov 0x4(%esp,1),%edx + 15f0: 8b 44 24 08 mov 0x8(%esp,1),%eax + _end_pagebuf_page_io(bh, uptodate, 0); + 15f4: 6a 00 push $0x0 + 15f6: 50 push %eax + 15f7: 52 push %edx + 15f8: e8 83 fe ff ff call 1480 <_end_pagebuf_page_io> +} + 15fd: 83 c4 0c add $0xc,%esp + 1600: c3 ret + 1601: 8d 76 00 lea 0x0(%esi),%esi + +0000000000001604 <_end_pagebuf_page_io_multi>: + +typedef struct { + page_buf_t *pb; /* pointer to pagebuf page is within */ + int locking; /* are pages locked? */ + atomic_t remain; /* count of remaining I/O requests */ +} pagesync_t; + +STATIC void +_end_pagebuf_page_io_multi( + struct buffer_head *bh, + int uptodate, + int fullpage) +{ + 1604: 83 ec 08 sub $0x8,%esp + 1607: 55 push %ebp + 1608: 57 push %edi + 1609: 56 push %esi + 160a: 53 push %ebx + 160b: 8b 5c 24 1c mov 0x1c(%esp,1),%ebx + pagesync_t *psync = (pagesync_t *) bh->b_private; + 160f: 8b 6b 40 mov 0x40(%ebx),%ebp + 1612: 8b 44 24 20 mov 0x20(%esp,1),%eax + page_buf_t *pb = psync->pb; + 1616: 8b 55 00 mov 0x0(%ebp),%edx + 1619: 89 54 24 14 mov %edx,0x14(%esp,1) + * This is called by bh->b_end_io() handlers when I/O has completed. + */ +static inline void mark_buffer_uptodate(struct buffer_head * bh, int on) +{ + if (on) + 161d: 85 c0 test %eax,%eax + 161f: 74 0f je 1630 <_end_pagebuf_page_io_multi+0x2c> + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. 
+ */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + 1621: 31 c0 xor %eax,%eax + __asm__ __volatile__( LOCK_PREFIX + 1623: f0 0f ab 43 18 lock bts %eax,0x18(%ebx) + */ +static inline void mark_buffer_uptodate(struct buffer_head * bh, int on) +{ + if (on) + set_bit(BH_Uptodate, &bh->b_state); + 1628: eb 0b jmp 1635 <_end_pagebuf_page_io_multi+0x31> + 162a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + * in order to ensure changes are visible on other processors. + */ +static __inline__ void clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + 1630: f0 0f b3 43 18 lock btr %eax,0x18(%ebx) + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_dec(atomic_t *v) +{ + __asm__ __volatile__( + 1635: f0 ff 4b 10 lock decl 0x10(%ebx) + struct page *page; + + mark_buffer_uptodate(bh, uptodate); + put_bh(bh); + + page = bh->b_page; + 1639: 8b 43 38 mov 0x38(%ebx),%eax + 163c: 89 44 24 10 mov %eax,0x10(%esp,1) +#endif + +static __inline__ int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; + 1640: 8b 43 18 mov 0x18(%ebx),%eax + if (!test_bit(BH_Uptodate, &bh->b_state)) { + 1643: a8 01 test $0x1,%al + 1645: 75 18 jne 165f <_end_pagebuf_page_io_multi+0x5b> + * restricted to acting on a single-word quantity. 
+ */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + 1647: 8b 54 24 10 mov 0x10(%esp,1),%edx + 164b: b8 01 00 00 00 mov $0x1,%eax + 1650: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + set_bit(PG_error, &page->flags); + pb->pb_error = EIO; + 1655: 8b 44 24 14 mov 0x14(%esp,1),%eax + 1659: 66 c7 40 7c 05 00 movw $0x5,0x7c(%eax) + } + + unlock_buffer(bh); + 165f: 89 d8 mov %ebx,%eax + 1661: e8 fc ff ff ff call 1662 <_end_pagebuf_page_io_multi+0x5e> + if (fullpage) + 1666: 83 7c 24 24 00 cmpl $0x0,0x24(%esp,1) + 166b: 0f 84 bf 00 00 00 je 1730 <_end_pagebuf_page_io_multi+0x12c> + 1671: 83 3d 00 00 00 00 40 cmpl $0x40,0x0 + 1678: 75 16 jne 1690 <_end_pagebuf_page_io_multi+0x8c> + 167a: a1 00 00 00 00 mov 0x0,%eax + 167f: 53 push %ebx + 1680: 50 push %eax + 1681: e8 fc ff ff ff call 1682 <_end_pagebuf_page_io_multi+0x7e> + 1686: 83 c4 08 add $0x8,%esp + 1689: e9 a2 00 00 00 jmp 1730 <_end_pagebuf_page_io_multi+0x12c> + 168e: 89 f6 mov %esi,%esi + 1690: 9c pushf + 1691: 5f pop %edi + 1692: fa cli + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 1693: be 00 00 00 00 mov $0x0,%esi +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 1698: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 169f: 4e ad de + 16a2: 74 1a je 16be <_end_pagebuf_page_io_multi+0xba> +printk("eip: %p\n", &&here); + 16a4: 68 98 16 00 00 push $0x1698 + 16a9: 68 2b 00 00 00 push $0x2b + 16ae: e8 fc ff ff ff call 16af <_end_pagebuf_page_io_multi+0xab> + BUG(); + 16b3: 0f 0b ud2a + 16b5: 85 00 test %eax,(%eax) + 16b7: 00 00 add %al,(%eax) + 16b9: 00 00 add %al,(%eax) + } + 16bb: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 16be: f0 fe 0e lock decb (%esi) + 16c1: 0f 88 a0 17 00 00 js 2e67 + 16c7: c7 43 30 00 00 00 00 movl $0x0,0x30(%ebx) + 16ce: a1 00 00 00 00 mov 0x0,%eax + 16d3: 89 03 mov %eax,(%ebx) + 16d5: 89 1d 00 00 00 00 mov %ebx,0x0 + 16db: ff 05 00 00 00 00 incl 0x0 + 16e1: 81 3d 
48 03 00 00 48 cmpl $0x348,0x348 + 16e8: 03 00 00 + 16eb: 74 14 je 1701 <_end_pagebuf_page_io_multi+0xfd> + 16ed: b9 01 00 00 00 mov $0x1,%ecx + 16f2: ba 03 00 00 00 mov $0x3,%edx + 16f7: b8 40 03 00 00 mov $0x340,%eax + 16fc: e8 fc ff ff ff call 16fd <_end_pagebuf_page_io_multi+0xf9> + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 1701: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 1703: 81 3d 04 00 00 00 ad cmpl $0xdead4ead,0x4 + 170a: 4e ad de + 170d: 74 08 je 1717 <_end_pagebuf_page_io_multi+0x113> + BUG(); + 170f: 0f 0b ud2a + 1711: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 1717: a0 00 00 00 00 mov 0x0,%al + 171c: 84 c0 test %al,%al + 171e: 7e 08 jle 1728 <_end_pagebuf_page_io_multi+0x124> + BUG(); + 1720: 0f 0b ud2a + 1722: 6b 00 00 imul $0x0,(%eax),%eax + 1725: 00 00 add %al,(%eax) + 1727: 00 86 15 00 00 00 add %al,0x15(%esi) +#endif + __asm__ __volatile__( + 172d: 00 57 9d add %dl,0xffffff9d(%edi) +static __inline__ int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + __asm__ __volatile__( + 1730: f0 ff 4d 08 lock decl 0x8(%ebp) + 1734: 0f 94 c0 sete %al + _pagebuf_free_bh(bh); + + if (atomic_dec_and_test(&psync->remain) == 1) { + 1737: 84 c0 test %al,%al + 1739: 74 4f je 178a <_end_pagebuf_page_io_multi+0x186> + if (fullpage) + 173b: 83 7c 24 24 00 cmpl $0x0,0x24(%esp,1) + 1740: 74 0e je 1750 <_end_pagebuf_page_io_multi+0x14c> + * restricted to acting on a single-word quantity. 
+ */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + 1742: 8b 54 24 10 mov 0x10(%esp,1),%edx + 1746: b8 03 00 00 00 mov $0x3,%eax + 174b: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + SetPageUptodate(page); + if (psync->locking) + 1750: 83 7d 04 00 cmpl $0x0,0x4(%ebp) + 1754: 74 09 je 175f <_end_pagebuf_page_io_multi+0x15b> + unlock_page(page); + 1756: 8b 44 24 10 mov 0x10(%esp,1),%eax + 175a: e8 fc ff ff ff call 175b <_end_pagebuf_page_io_multi+0x157> + kfree(psync); + 175f: 55 push %ebp + 1760: e8 fc ff ff ff call 1761 <_end_pagebuf_page_io_multi+0x15d> + 1765: 83 c4 04 add $0x4,%esp +static __inline__ int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + __asm__ __volatile__( + 1768: 8b 54 24 14 mov 0x14(%esp,1),%edx + 176c: f0 ff 8a b4 00 00 00 lock decl 0xb4(%edx) + 1773: 0f 94 c0 sete %al + 1776: 84 c0 test %al,%al + 1778: 74 10 je 178a <_end_pagebuf_page_io_multi+0x186> + 177a: c6 82 82 00 00 00 00 movb $0x0,0x82(%edx) + 1781: 52 push %edx + 1782: e8 fc ff ff ff call 1783 <_end_pagebuf_page_io_multi+0x17f> + 1787: 83 c4 04 add $0x4,%esp + _pb_io_done(pb); + } + 178a: 5b pop %ebx + 178b: 5e pop %esi + 178c: 5f pop %edi + 178d: 5d pop %ebp + 178e: 59 pop %ecx + 178f: 5a pop %edx + 1790: c3 ret +} + 1791: 8d 76 00 lea 0x0(%esi),%esi + +0000000000001794 <_end_io_multi_full>: + +STATIC void +_end_io_multi_full( + struct buffer_head *bh, + int uptodate) +{ + 1794: 8b 54 24 04 mov 0x4(%esp,1),%edx + 1798: 8b 44 24 08 mov 0x8(%esp,1),%eax + _end_pagebuf_page_io_multi(bh, uptodate, 1); + 179c: 6a 01 push $0x1 + 179e: 50 push %eax + 179f: 52 push %edx + 17a0: e8 5f fe ff ff call 1604 <_end_pagebuf_page_io_multi> +} + 17a5: 83 c4 0c add $0xc,%esp + 17a8: c3 ret + 17a9: 8d 76 00 lea 0x0(%esi),%esi + +00000000000017ac <_end_io_multi_part>: + +STATIC void +_end_io_multi_part( + struct buffer_head *bh, + int uptodate) +{ + 17ac: 8b 54 24 04 mov 0x4(%esp,1),%edx + 17b0: 8b 44 24 08 mov 0x8(%esp,1),%eax + 
_end_pagebuf_page_io_multi(bh, uptodate, 0); + 17b4: 6a 00 push $0x0 + 17b6: 50 push %eax + 17b7: 52 push %edx + 17b8: e8 47 fe ff ff call 1604 <_end_pagebuf_page_io_multi> +} + 17bd: 83 c4 0c add $0xc,%esp + 17c0: c3 ret + 17c1: 8d 76 00 lea 0x0(%esi),%esi + +00000000000017c4 <_pagebuf_page_io>: + + +/* + * Initiate I/O on part of a page we are interested in + */ +STATIC int +_pagebuf_page_io( + struct page *page, /* Page structure we are dealing with */ + page_buf_t *pb, /* pagebuf holding it, can be NULL */ + page_buf_daddr_t bn, /* starting block number */ + kdev_t dev, /* device for I/O */ + size_t blocksize, /* filesystem block size */ + off_t pg_offset, /* starting offset in page */ + size_t pg_length, /* count of data to process */ + int locking, /* page locking in use */ + int rw, /* read/write operation */ + int flush) +{ + 17c4: 83 ec 44 sub $0x44,%esp + 17c7: 55 push %ebp + 17c8: 57 push %edi + 17c9: 56 push %esi + 17ca: 53 push %ebx + 17cb: 8b 44 24 68 mov 0x68(%esp,1),%eax + 17cf: 66 89 44 24 32 mov %ax,0x32(%esp,1) + size_t sector; + size_t blk_length = 0; + struct buffer_head *bh, *head, *bufferlist[MAX_BUF_PER_PAGE]; + int multi_ok; + int i = 0, cnt = 0, err = 0; + 17d4: c7 44 24 24 00 00 00 movl $0x0,0x24(%esp,1) + 17db: 00 + 17dc: c7 44 24 20 00 00 00 movl $0x0,0x20(%esp,1) + 17e3: 00 + int public_bh = 0; + 17e4: c7 44 24 1c 00 00 00 movl $0x0,0x1c(%esp,1) + 17eb: 00 + + if ((blocksize < PAGE_CACHE_SIZE) && + 17ec: 81 7c 24 6c ff 0f 00 cmpl $0xfff,0x6c(%esp,1) + 17f3: 00 + 17f4: 0f 87 69 01 00 00 ja 1963 <_pagebuf_page_io+0x19f> + 17fa: 8b 54 24 5c mov 0x5c(%esp,1),%edx + 17fe: 8b 42 08 mov 0x8(%edx),%eax + 1801: a9 00 00 10 00 test $0x100000,%eax + 1806: 0f 85 57 01 00 00 jne 1963 <_pagebuf_page_io+0x19f> + !(pb->pb_flags & _PBF_PRIVATE_BH)) { + int cache_ok; + + cache_ok = !((pb->pb_flags & PBF_FORCEIO) || (rw == WRITE)); + 180c: c7 44 24 18 00 00 00 movl $0x0,0x18(%esp,1) + 1813: 00 + 1814: a9 00 00 00 08 test $0x8000000,%eax + 1819: 75 1a jne 
1835 <_pagebuf_page_io+0x71> + 181b: c7 44 24 18 01 00 00 movl $0x1,0x18(%esp,1) + 1822: 00 + 1823: 8b 4c 24 18 mov 0x18(%esp,1),%ecx + 1827: 83 7c 24 7c 01 cmpl $0x1,0x7c(%esp,1) + 182c: 0f 44 4c 24 24 cmove 0x24(%esp,1),%ecx + 1831: 89 4c 24 18 mov %ecx,0x18(%esp,1) + public_bh = multi_ok = 1; + + if (!page_has_buffers(page)) { + 1835: 8b 7c 24 58 mov 0x58(%esp,1),%edi + 1839: c7 44 24 28 01 00 00 movl $0x1,0x28(%esp,1) + 1840: 00 + 1841: c7 44 24 1c 01 00 00 movl $0x1,0x1c(%esp,1) + 1848: 00 + 1849: 83 7f 28 00 cmpl $0x0,0x28(%edi) + 184d: 75 4b jne 189a <_pagebuf_page_io+0xd6> + if (!locking) { + 184f: 83 7c 24 78 00 cmpl $0x0,0x78(%esp,1) + 1854: 75 2c jne 1882 <_pagebuf_page_io+0xbe> + lock_page(page); + 1856: 89 f8 mov %edi,%eax + 1858: e8 fc ff ff ff call 1859 <_pagebuf_page_io+0x95> + if (!page_has_buffers(page)) { + 185d: 83 7f 28 00 cmpl $0x0,0x28(%edi) + 1861: 75 14 jne 1877 <_pagebuf_page_io+0xb3> + create_empty_buffers(page, dev, + 1863: 68 00 02 00 00 push $0x200 + 1868: 0f b7 44 24 36 movzwl 0x36(%esp,1),%eax + 186d: 50 push %eax + 186e: 57 push %edi + 186f: e8 fc ff ff ff call 1870 <_pagebuf_page_io+0xac> + SECTOR_SIZE); + } + 1874: 83 c4 0c add $0xc,%esp + unlock_page(page); + 1877: 8b 44 24 58 mov 0x58(%esp,1),%eax + 187b: e8 fc ff ff ff call 187c <_pagebuf_page_io+0xb8> + } else { + 1880: eb 18 jmp 189a <_pagebuf_page_io+0xd6> + create_empty_buffers(page, dev, SECTOR_SIZE); + 1882: 68 00 02 00 00 push $0x200 + 1887: 0f b7 44 24 36 movzwl 0x36(%esp,1),%eax + 188c: 50 push %eax + 188d: 8b 44 24 60 mov 0x60(%esp,1),%eax + 1891: 50 push %eax + 1892: e8 fc ff ff ff call 1893 <_pagebuf_page_io+0xcf> + } + 1897: 83 c4 0c add $0xc,%esp + } + + /* Find buffer_heads belonging to just this pagebuf */ + bh = head = page_buffers(page); + 189a: 8b 54 24 58 mov 0x58(%esp,1),%edx + 189e: 8b 52 28 mov 0x28(%edx),%edx + 18a1: 89 54 24 10 mov %edx,0x10(%esp,1) + 18a5: 89 54 24 2c mov %edx,0x2c(%esp,1) + 18a9: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + do { + if 
(buffer_uptodate(bh) && cache_ok) + 18b0: 8b 4c 24 10 mov 0x10(%esp,1),%ecx + 18b4: f6 41 18 01 testb $0x1,0x18(%ecx) + 18b8: 74 0b je 18c5 <_pagebuf_page_io+0x101> + 18ba: 83 7c 24 18 00 cmpl $0x0,0x18(%esp,1) + 18bf: 0f 85 7e 00 00 00 jne 1943 <_pagebuf_page_io+0x17f> + continue; + blk_length = i << SECTOR_SHIFT; + 18c5: 8b 5c 24 24 mov 0x24(%esp,1),%ebx + 18c9: c1 e3 09 shl $0x9,%ebx + if (blk_length < pg_offset) + 18cc: 3b 5c 24 70 cmp 0x70(%esp,1),%ebx + 18d0: 72 71 jb 1943 <_pagebuf_page_io+0x17f> + continue; + if (blk_length >= pg_offset + pg_length) + 18d2: 8b 44 24 70 mov 0x70(%esp,1),%eax + 18d6: 03 44 24 74 add 0x74(%esp,1),%eax + 18da: 39 c3 cmp %eax,%ebx + 18dc: 0f 83 f0 01 00 00 jae 1ad2 <_pagebuf_page_io+0x30e> +} + +static inline void lock_buffer(struct buffer_head * bh) +{ + while (test_and_set_bit(BH_Lock, &bh->b_state)) + 18e2: 8b 5c 24 70 mov 0x70(%esp,1),%ebx + 18e6: 8b 7c 24 20 mov 0x20(%esp,1),%edi + 18ea: 8b 74 24 20 mov 0x20(%esp,1),%esi + 18ee: c1 fb 09 sar $0x9,%ebx + 18f1: c1 e7 02 shl $0x2,%edi + 18f4: 89 7c 24 14 mov %edi,0x14(%esp,1) + 18f8: 8d 6c 24 34 lea 0x34(%esp,1),%ebp + 18fc: 46 inc %esi + 18fd: eb 0e jmp 190d <_pagebuf_page_io+0x149> + 18ff: 90 nop + __wait_on_buffer(bh); + 1900: 8b 44 24 10 mov 0x10(%esp,1),%eax + 1904: 50 push %eax + 1905: e8 fc ff ff ff call 1906 <_pagebuf_page_io+0x142> + 190a: 83 c4 04 add $0x4,%esp +static __inline__ int test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + 190d: 8b 54 24 10 mov 0x10(%esp,1),%edx + 1911: b8 02 00 00 00 mov $0x2,%eax + 1916: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + 191b: 19 c0 sbb %eax,%eax +} + +static inline void lock_buffer(struct buffer_head * bh) +{ + while (test_and_set_bit(BH_Lock, &bh->b_state)) + 191d: 85 c0 test %eax,%eax + 191f: 75 df jne 1900 <_pagebuf_page_io+0x13c> + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + 1921: f0 ff 42 10 lock incl 0x10(%edx) + break; + + lock_buffer(bh); + get_bh(bh); + assert(!waitqueue_active(&bh->b_wait)); + + bh->b_size = SECTOR_SIZE; + 1925: 66 c7 42 08 00 02 movw $0x200,0x8(%edx) + bh->b_blocknr = bn + (i - (pg_offset >> SECTOR_SHIFT)); + 192b: 8b 44 24 24 mov 0x24(%esp,1),%eax + 192f: 29 d8 sub %ebx,%eax + 1931: 03 44 24 60 add 0x60(%esp,1),%eax + 1935: 89 42 04 mov %eax,0x4(%edx) + bufferlist[cnt++] = bh; + 1938: 8b 4c 24 14 mov 0x14(%esp,1),%ecx + 193c: 89 14 29 mov %edx,(%ecx,%ebp,1) + 193f: 89 74 24 20 mov %esi,0x20(%esp,1) + } while (i++, (bh = bh->b_this_page) != head); + 1943: 8b 7c 24 10 mov 0x10(%esp,1),%edi + 1947: 8b 44 24 2c mov 0x2c(%esp,1),%eax + 194b: ff 44 24 24 incl 0x24(%esp,1) + 194f: 8b 7f 28 mov 0x28(%edi),%edi + 1952: 89 7c 24 10 mov %edi,0x10(%esp,1) + 1956: 39 c7 cmp %eax,%edi + 1958: 0f 85 52 ff ff ff jne 18b0 <_pagebuf_page_io+0xec> + + goto request; + 195e: e9 6f 01 00 00 jmp 1ad2 <_pagebuf_page_io+0x30e> + } + + /* Calculate the block offsets and length we will be using */ + if (pg_offset) { + 1963: 83 7c 24 70 00 cmpl $0x0,0x70(%esp,1) + 1968: 74 1d je 1987 <_pagebuf_page_io+0x1c3> + size_t block_offset; + + block_offset = pg_offset >> SECTOR_SHIFT; + block_offset = pg_offset - (block_offset << SECTOR_SHIFT); + 196a: 8b 54 24 70 mov 0x70(%esp,1),%edx + 196e: 8b 44 24 70 mov 0x70(%esp,1),%eax + 1972: 81 e2 00 fe ff ff and $0xfffffe00,%edx + 1978: 29 d0 sub %edx,%eax + blk_length = (pg_length + block_offset + SECTOR_MASK) >> + 197a: 8b 54 24 74 mov 0x74(%esp,1),%edx + 197e: 8d 9c 10 ff 01 00 00 lea 0x1ff(%eax,%edx,1),%ebx + SECTOR_SHIFT; + } else { + 1985: eb 0a jmp 1991 <_pagebuf_page_io+0x1cd> + blk_length = (pg_length + SECTOR_MASK) >> SECTOR_SHIFT; + 1987: 8b 5c 24 74 mov 0x74(%esp,1),%ebx + 198b: 81 c3 ff 01 00 00 add $0x1ff,%ebx + 1991: c1 eb 09 shr $0x9,%ebx + } + + /* This will attempt to make a request bigger than the sector 
+ * size if we are well aligned. + */ + switch (pb->pb_target->pbr_flags) { + 1994: 8b 4c 24 5c mov 0x5c(%esp,1),%ecx + 1998: 8b 41 14 mov 0x14(%ecx),%eax + 199b: 8b 00 mov (%eax),%eax + 199d: 83 f8 01 cmp $0x1,%eax + 19a0: 74 3e je 19e0 <_pagebuf_page_io+0x21c> + 19a2: 7f 0c jg 19b0 <_pagebuf_page_io+0x1ec> + 19a4: 85 c0 test %eax,%eax + 19a6: 74 2a je 19d2 <_pagebuf_page_io+0x20e> + 19a8: eb 36 jmp 19e0 <_pagebuf_page_io+0x21c> + 19aa: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + 19b0: 83 f8 02 cmp $0x2,%eax + 19b3: 75 2b jne 19e0 <_pagebuf_page_io+0x21c> + case 0: + sector = blk_length << SECTOR_SHIFT; + blk_length = 1; + break; + case PBR_ALIGNED_ONLY: + if ((pg_offset == 0) && (pg_length == PAGE_CACHE_SIZE) && + 19b5: 83 7c 24 70 00 cmpl $0x0,0x70(%esp,1) + 19ba: 75 24 jne 19e0 <_pagebuf_page_io+0x21c> + 19bc: 81 7c 24 74 00 10 00 cmpl $0x1000,0x74(%esp,1) + 19c3: 00 + 19c4: 75 1a jne 19e0 <_pagebuf_page_io+0x21c> + 19c6: 8b 7c 24 60 mov 0x60(%esp,1),%edi + 19ca: f7 c7 07 00 00 00 test $0x7,%edi + 19d0: 75 0e jne 19e0 <_pagebuf_page_io+0x21c> + (((unsigned int) bn) & BN_ALIGN_MASK) == 0) { + sector = blk_length << SECTOR_SHIFT; + 19d2: 89 de mov %ebx,%esi + 19d4: c1 e6 09 shl $0x9,%esi + blk_length = 1; + 19d7: bb 01 00 00 00 mov $0x1,%ebx + break; + 19dc: eb 0c jmp 19ea <_pagebuf_page_io+0x226> + 19de: 89 f6 mov %esi,%esi + } + case PBR_SECTOR_ONLY: + /* Fallthrough, same as default */ + default: + sector = SECTOR_SIZE; + 19e0: be 00 02 00 00 mov $0x200,%esi + } + + /* The b_size field of struct buffer_head is an unsigned short + * ... we may need to split this request up. 
[64K is too big] + */ + assert(sizeof(bh->b_size) == 2); + 19e5: eb 03 jmp 19ea <_pagebuf_page_io+0x226> + while (sector > 0xffff) { + sector >>= 1; + 19e7: d1 ee shr %esi + blk_length++; + 19e9: 43 inc %ebx + } + 19ea: 81 fe ff ff 00 00 cmp $0xffff,%esi + 19f0: 77 f5 ja 19e7 <_pagebuf_page_io+0x223> + + multi_ok = (blk_length != 1); + 19f2: 83 fb 01 cmp $0x1,%ebx + 19f5: 0f 95 c0 setne %al + 19f8: 0f b6 c0 movzbl %al,%eax + 19fb: 89 44 24 28 mov %eax,0x28(%esp,1) + + for (; blk_length > 0; blk_length--, pg_offset += sector) { + 19ff: 85 db test %ebx,%ebx + 1a01: 0f 84 cb 00 00 00 je 1ad2 <_pagebuf_page_io+0x30e> + 1a07: 31 ed xor %ebp,%ebp + 1a09: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS); + 1a10: a1 00 00 00 00 mov 0x0,%eax + 1a15: 68 f0 00 00 00 push $0xf0 + 1a1a: 50 push %eax + 1a1b: e8 fc ff ff ff call 1a1c <_pagebuf_page_io+0x258> + 1a20: 89 44 24 18 mov %eax,0x18(%esp,1) + if (!bh) { + 1a24: 83 c4 08 add $0x8,%esp + 1a27: 85 c0 test %eax,%eax + 1a29: 75 11 jne 1a3c <_pagebuf_page_io+0x278> + bh = _pagebuf_get_prealloc_bh(); + 1a2b: e8 90 ed ff ff call 7c0 <_pagebuf_get_prealloc_bh> + 1a30: 89 44 24 10 mov %eax,0x10(%esp,1) + if (!bh) { + 1a34: 85 c0 test %eax,%eax + 1a36: 0f 84 d4 01 00 00 je 1c10 <_pagebuf_page_io+0x44c> + : "memory") +{ + int d0, d1; + switch (count % 4) { + case 0: COMMON(""); return s; + 1a3c: 8b 7c 24 10 mov 0x10(%esp,1),%edi + 1a40: b9 19 00 00 00 mov $0x19,%ecx + 1a45: 89 e8 mov %ebp,%eax + 1a47: f3 ab repz stos %eax,%es:(%edi) + /* This should never happen */ + err = -ENOMEM; + goto error; + } + } + memset(bh, 0, sizeof(*bh)); + bh->b_size = sector; + 1a49: 8b 44 24 10 mov 0x10(%esp,1),%eax + 1a4d: 66 89 70 08 mov %si,0x8(%eax) + bh->b_blocknr = bn++; + 1a51: 8b 54 24 60 mov 0x60(%esp,1),%edx + 1a55: 89 50 04 mov %edx,0x4(%eax) + bh->b_dev = dev; + 1a58: 0f b7 4c 24 32 movzwl 0x32(%esp,1),%ecx + 1a5d: 83 44 24 60 01 addl $0x1,0x60(%esp,1) + 1a62: 83 54 24 64 00 adcl $0x0,0x64(%esp,1) + 
1a67: 66 89 48 0c mov %cx,0xc(%eax) + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + 1a6b: 8b 7c 24 10 mov 0x10(%esp,1),%edi + 1a6f: b8 02 00 00 00 mov $0x2,%eax + 1a74: f0 0f ab 47 18 lock bts %eax,0x18(%edi) + set_bit(BH_Lock, &bh->b_state); + set_bh_page(bh, page, pg_offset); + 1a79: 8b 44 24 70 mov 0x70(%esp,1),%eax + 1a7d: 50 push %eax + 1a7e: 8b 54 24 5c mov 0x5c(%esp,1),%edx + 1a82: 52 push %edx + 1a83: 57 push %edi + 1a84: e8 fc ff ff ff call 1a85 <_pagebuf_page_io+0x2c1> +#define DECLARE_WAIT_QUEUE_HEAD(name) \ + wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name) + +static inline void init_waitqueue_head(wait_queue_head_t *q) +{ + 1a89: 89 f9 mov %edi,%ecx + 1a8b: 83 c4 0c add $0xc,%esp +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); +#endif + q->lock = WAITQUEUE_RW_LOCK_UNLOCKED; + 1a8e: 8b 7c 24 10 mov 0x10(%esp,1),%edi + 1a92: 83 c1 48 add $0x48,%ecx + 1a95: b8 01 00 00 00 mov $0x1,%eax + 1a9a: ba ad 4e ad de mov $0xdead4ead,%edx + 1a9f: 89 47 48 mov %eax,0x48(%edi) + 1aa2: 89 57 4c mov %edx,0x4c(%edi) + INIT_LIST_HEAD(&q->task_list); + 1aa5: 89 f8 mov %edi,%eax + 1aa7: 83 c0 50 add $0x50,%eax + 1aaa: 89 41 08 mov %eax,0x8(%ecx) + 1aad: 89 41 0c mov %eax,0xc(%ecx) + init_waitqueue_head(&bh->b_wait); + atomic_set(&bh->b_count, 1); + 1ab0: c7 47 10 01 00 00 00 movl $0x1,0x10(%edi) + bufferlist[cnt++] = bh; + 1ab7: 8b 54 24 20 mov 0x20(%esp,1),%edx + 1abb: 8d 44 24 34 lea 0x34(%esp,1),%eax + 1abf: 89 3c 90 mov %edi,(%eax,%edx,4) + 1ac2: 42 inc %edx + 1ac3: 89 54 24 20 mov %edx,0x20(%esp,1) + 1ac7: 01 74 24 70 add %esi,0x70(%esp,1) + 1acb: 4b dec %ebx + 1acc: 0f 85 3e ff ff ff jne 1a10 <_pagebuf_page_io+0x24c> + } + +request: + if (cnt) { + 1ad2: 83 7c 24 20 00 cmpl $0x0,0x20(%esp,1) + 1ad7: 0f 84 1a 01 00 00 je 1bf7 <_pagebuf_page_io+0x433> + pagesync_t *psync = NULL; + 1add: 31 db xor %ebx,%ebx + void (*callback)(struct buffer_head *, int); 
+ + if (multi_ok) { + 1adf: 83 7c 24 28 00 cmpl $0x0,0x28(%esp,1) + 1ae4: 74 4a je 1b30 <_pagebuf_page_io+0x36c> + size_t size = sizeof(pagesync_t); + + psync = (pagesync_t *) kmalloc(size, GFP_NOFS); + 1ae6: 68 f0 00 00 00 push $0xf0 + 1aeb: 6a 0c push $0xc + 1aed: e8 fc ff ff ff call 1aee <_pagebuf_page_io+0x32a> + 1af2: 89 c3 mov %eax,%ebx + if (!psync) + 1af4: 83 c4 08 add $0x8,%esp + 1af7: 85 db test %ebx,%ebx + 1af9: 75 08 jne 1b03 <_pagebuf_page_io+0x33f> + BUG(); /* Ugh - out of memory condition here */ + 1afb: 0f 0b ud2a + 1afd: 78 06 js 1b05 <_pagebuf_page_io+0x341> + 1aff: 34 00 xor $0x0,%al + 1b01: 00 00 add %al,(%eax) + psync->pb = pb; + 1b03: 8b 4c 24 5c mov 0x5c(%esp,1),%ecx + 1b07: 89 0b mov %ecx,(%ebx) + psync->locking = locking; + 1b09: 8b 7c 24 78 mov 0x78(%esp,1),%edi + 1b0d: 89 7b 04 mov %edi,0x4(%ebx) + atomic_set(&psync->remain, 0); + 1b10: c7 43 08 00 00 00 00 movl $0x0,0x8(%ebx) + + callback = public_bh ? + 1b17: be 94 17 00 00 mov $0x1794,%esi + 1b1c: b8 ac 17 00 00 mov $0x17ac,%eax + 1b21: 83 7c 24 1c 00 cmpl $0x0,0x1c(%esp,1) + 1b26: 0f 45 f0 cmovne %eax,%esi + _end_io_multi_part : _end_io_multi_full; + } else { + 1b29: eb 17 jmp 1b42 <_pagebuf_page_io+0x37e> + 1b2b: 90 nop + 1b2c: 8d 74 26 00 lea 0x0(%esi,1),%esi + callback = locking ? _end_io_locked : _end_io_nolock; + 1b30: be ec 15 00 00 mov $0x15ec,%esi + 1b35: b8 d4 15 00 00 mov $0x15d4,%eax + 1b3a: 83 7c 24 78 00 cmpl $0x0,0x78(%esp,1) + 1b3f: 0f 45 f0 cmovne %eax,%esi + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + 1b42: 8b 44 24 5c mov 0x5c(%esp,1),%eax + 1b46: f0 ff 80 b4 00 00 00 lock incl 0xb4(%eax) + } + + /* Indicate that there is another page in progress */ + atomic_inc(&PBP(pb)->pb_io_remaining); + +#ifdef RQ_WRITE_ORDERED + if (flush) + set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state); +#endif + + for (i = 0; i < cnt; i++) { + 1b4d: 8b 54 24 20 mov 0x20(%esp,1),%edx + 1b51: c7 44 24 24 00 00 00 movl $0x0,0x24(%esp,1) + 1b58: 00 + 1b59: 39 54 24 24 cmp %edx,0x24(%esp,1) + 1b5d: 0f 8d a4 00 00 00 jge 1c07 <_pagebuf_page_io+0x443> + bh = bufferlist[i]; + 1b63: 8b 4c 24 24 mov 0x24(%esp,1),%ecx + 1b67: 8d 44 24 34 lea 0x34(%esp,1),%eax + 1b6b: 8b 04 88 mov (%eax,%ecx,4),%eax + 1b6e: 89 44 24 10 mov %eax,0x10(%esp,1) + + /* Complete the buffer_head, then submit the IO */ + if (psync) { + 1b72: 85 db test %ebx,%ebx + 1b74: 74 11 je 1b87 <_pagebuf_page_io+0x3c3> + init_buffer(bh, callback, psync); + 1b76: 53 push %ebx + 1b77: 56 push %esi + 1b78: 50 push %eax + 1b79: e8 fc ff ff ff call 1b7a <_pagebuf_page_io+0x3b6> + * Atomically increments @v by 1. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ void atomic_inc(atomic_t *v) +{ + 1b7e: 83 c4 0c add $0xc,%esp + __asm__ __volatile__( + 1b81: f0 ff 43 08 lock incl 0x8(%ebx) + atomic_inc(&psync->remain); + } else { + 1b85: eb 13 jmp 1b9a <_pagebuf_page_io+0x3d6> + init_buffer(bh, callback, pb); + 1b87: 8b 7c 24 5c mov 0x5c(%esp,1),%edi + 1b8b: 57 push %edi + 1b8c: 56 push %esi + 1b8d: 8b 44 24 18 mov 0x18(%esp,1),%eax + 1b91: 50 push %eax + 1b92: e8 fc ff ff ff call 1b93 <_pagebuf_page_io+0x3cf> + } + 1b97: 83 c4 0c add $0xc,%esp + + bh->b_rdev = bh->b_dev; + 1b9a: 8b 54 24 10 mov 0x10(%esp,1),%edx + 1b9e: 0f b7 42 0c movzwl 0xc(%edx),%eax + 1ba2: 66 89 42 14 mov %ax,0x14(%edx) + bh->b_rsector = bh->b_blocknr; + 1ba6: 8b 42 04 mov 0x4(%edx),%eax + 1ba9: 89 42 44 mov %eax,0x44(%edx) + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + 1bac: b8 04 00 00 00 mov $0x4,%eax + __asm__ __volatile__( LOCK_PREFIX + 1bb1: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + 1bb6: b8 03 00 00 00 mov $0x3,%eax + 1bbb: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + set_bit(BH_Mapped, &bh->b_state); + set_bit(BH_Req, &bh->b_state); + + if (rw == WRITE) { + 1bc0: 83 7c 24 7c 01 cmpl $0x1,0x7c(%esp,1) + 1bc5: 75 07 jne 1bce <_pagebuf_page_io+0x40a> + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. 
+ */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + 1bc7: 31 c0 xor %eax,%eax + __asm__ __volatile__( LOCK_PREFIX + 1bc9: f0 0f ab 42 18 lock bts %eax,0x18(%edx) + set_bit(BH_Uptodate, &bh->b_state); + } + generic_make_request(rw, bh); + 1bce: 8b 4c 24 10 mov 0x10(%esp,1),%ecx + 1bd2: 51 push %ecx + 1bd3: 8b bc 24 80 00 00 00 mov 0x80(%esp,1),%edi + 1bda: 57 push %edi + 1bdb: e8 fc ff ff ff call 1bdc <_pagebuf_page_io+0x418> + 1be0: 83 c4 08 add $0x8,%esp + 1be3: 8b 44 24 20 mov 0x20(%esp,1),%eax + 1be7: ff 44 24 24 incl 0x24(%esp,1) + 1beb: 39 44 24 24 cmp %eax,0x24(%esp,1) + 1bef: 0f 8c 6e ff ff ff jl 1b63 <_pagebuf_page_io+0x39f> + } + } else { + 1bf5: eb 10 jmp 1c07 <_pagebuf_page_io+0x443> + if (locking) + 1bf7: 83 7c 24 78 00 cmpl $0x0,0x78(%esp,1) + 1bfc: 74 09 je 1c07 <_pagebuf_page_io+0x443> + unlock_page(page); + 1bfe: 8b 44 24 58 mov 0x58(%esp,1),%eax + 1c02: e8 fc ff ff ff call 1c03 <_pagebuf_page_io+0x43f> + } + + return err; + 1c07: 31 c0 xor %eax,%eax + 1c09: eb 43 jmp 1c4e <_pagebuf_page_io+0x48a> + 1c0b: 90 nop + 1c0c: 8d 74 26 00 lea 0x0(%esi,1),%esi +error: + /* If we ever do get here then clean up what we already did */ + for (i = 0; i < cnt; i++) { + 1c10: 83 7c 24 20 00 cmpl $0x0,0x20(%esp,1) + 1c15: 7e 32 jle 1c49 <_pagebuf_page_io+0x485> + 1c17: 8b 54 24 20 mov 0x20(%esp,1),%edx + 1c1b: be 01 00 00 00 mov $0x1,%esi + 1c20: 31 db xor %ebx,%ebx + 1c22: 89 54 24 24 mov %edx,0x24(%esp,1) + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. 
+ */ +static __inline__ int test_and_clear_bit(int nr, volatile void * addr) +{ + 1c26: 8b 44 1c 34 mov 0x34(%esp,%ebx,1),%eax + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + 1c2a: f0 0f b3 70 18 lock btr %esi,0x18(%eax) + 1c2f: 19 d2 sbb %edx,%edx + atomic_set_buffer_clean(bufferlist[i]); + bufferlist[i]->b_end_io(bufferlist[i], 0); + 1c31: 8b 44 1c 34 mov 0x34(%esp,%ebx,1),%eax + 1c35: 6a 00 push $0x0 + 1c37: 50 push %eax + 1c38: 8b 40 3c mov 0x3c(%eax),%eax + 1c3b: ff d0 call *%eax + 1c3d: 83 c4 08 add $0x8,%esp + 1c40: 83 c3 04 add $0x4,%ebx + 1c43: ff 4c 24 24 decl 0x24(%esp,1) + 1c47: 75 dd jne 1c26 <_pagebuf_page_io+0x462> + } + return err; + 1c49: b8 f4 ff ff ff mov $0xfffffff4,%eax + 1c4e: 5b pop %ebx + 1c4f: 5e pop %esi + 1c50: 5f pop %edi + 1c51: 5d pop %ebp + 1c52: 83 c4 44 add $0x44,%esp + 1c55: c3 ret +} + 1c56: 89 f6 mov %esi,%esi + +0000000000001c58 <_page_buf_page_apply>: + +STATIC int +_page_buf_page_apply( + page_buf_t *pb, + loff_t offset, + struct page *page, + size_t pg_offset, + size_t pg_length, + int last) +{ + 1c58: 83 ec 0c sub $0xc,%esp + 1c5b: 55 push %ebp + 1c5c: 57 push %edi + 1c5d: 56 push %esi + 1c5e: 53 push %ebx + 1c5f: 8b 6c 24 20 mov 0x20(%esp,1),%ebp + page_buf_daddr_t bn = pb->pb_bn; + kdev_t dev = pb->pb_target->pbr_kdev; + 1c63: 8b 45 14 mov 0x14(%ebp),%eax + 1c66: 8b 75 1c mov 0x1c(%ebp),%esi + 1c69: 8b 7d 20 mov 0x20(%ebp),%edi + 1c6c: 0f b7 50 06 movzwl 0x6(%eax),%edx + 1c70: 66 89 54 24 1a mov %dx,0x1a(%esp,1) + size_t blocksize = pb->pb_target->pbr_blocksize; + 1c75: 8b 40 10 mov 0x10(%eax),%eax + 1c78: 89 44 24 14 mov %eax,0x14(%esp,1) + loff_t pb_offset; + size_t ret_len = pg_length; + 1c7c: 8b 44 24 34 mov 0x34(%esp,1),%eax + 1c80: 89 44 24 10 mov %eax,0x10(%esp,1) + + assert(page); + + if ((blocksize == PAGE_CACHE_SIZE) && + 1c84: 81 7c 24 14 00 10 00 cmpl $0x1000,0x14(%esp,1) + 1c8b: 00 + 1c8c: 75 42 jne 1cd0 <_page_buf_page_apply+0x78> + 1c8e: 81 7d 2c ff 0f 00 00 cmpl $0xfff,0x2c(%ebp) + 1c95: 77 39 ja 1cd0 
<_page_buf_page_apply+0x78> + 1c97: 8b 45 08 mov 0x8(%ebp),%eax + 1c9a: 89 c3 mov %eax,%ebx + 1c9c: f6 c3 01 test $0x1,%bl + 1c9f: 74 2f je 1cd0 <_page_buf_page_apply+0x78> + 1ca1: 80 bd 82 00 00 00 00 cmpb $0x0,0x82(%ebp) + 1ca8: 74 26 je 1cd0 <_page_buf_page_apply+0x78> + (pb->pb_buffer_length < PAGE_CACHE_SIZE) && + (pb->pb_flags & PBF_READ) && pb->pb_locked) { + bn -= (pb->pb_offset >> SECTOR_SHIFT); + 1caa: 0f b7 8d 80 00 00 00 movzwl 0x80(%ebp),%ecx + pg_offset = 0; + pg_length = PAGE_CACHE_SIZE; + 1cb1: 8b 54 24 14 mov 0x14(%esp,1),%edx + 1cb5: 66 c1 e9 09 shr $0x9,%cx + 1cb9: 0f b7 c1 movzwl %cx,%eax + 1cbc: 29 c6 sub %eax,%esi + 1cbe: 83 df 00 sbb $0x0,%edi + 1cc1: c7 44 24 30 00 00 00 movl $0x0,0x30(%esp,1) + 1cc8: 00 + 1cc9: 89 54 24 34 mov %edx,0x34(%esp,1) + } else { + 1ccd: eb 2c jmp 1cfb <_page_buf_page_apply+0xa3> + 1ccf: 90 nop + pb_offset = offset - pb->pb_file_offset; + if (pb_offset) { + 1cd0: 8b 5d 08 mov 0x8(%ebp),%ebx + 1cd3: 8b 54 24 24 mov 0x24(%esp,1),%edx + 1cd7: 8b 4c 24 28 mov 0x28(%esp,1),%ecx + 1cdb: 2b 55 24 sub 0x24(%ebp),%edx + 1cde: 1b 4d 28 sbb 0x28(%ebp),%ecx + 1ce1: 89 d0 mov %edx,%eax + 1ce3: 09 c8 or %ecx,%eax + 1ce5: 74 14 je 1cfb <_page_buf_page_apply+0xa3> + bn += (pb_offset + SECTOR_MASK) >> SECTOR_SHIFT; + 1ce7: 81 c2 ff 01 00 00 add $0x1ff,%edx + 1ced: 83 d1 00 adc $0x0,%ecx + 1cf0: 0f ac ca 09 shrd $0x9,%ecx,%edx + 1cf4: c1 f9 09 sar $0x9,%ecx + 1cf7: 01 d6 add %edx,%esi + 1cf9: 11 cf adc %ecx,%edi + } + } + + if (pb->pb_flags & PBF_READ) { + 1cfb: f6 c3 01 test $0x1,%bl + 1cfe: 74 10 je 1d10 <_page_buf_page_apply+0xb8> + _pagebuf_page_io(page, pb, bn, dev, blocksize, + 1d00: 6a 00 push $0x0 + 1d02: 6a 00 push $0x0 + 1d04: 0f b6 85 82 00 00 00 movzbl 0x82(%ebp),%eax + 1d0b: 50 push %eax + (off_t)pg_offset, pg_length, pb->pb_locked, READ, 0); + } else if (pb->pb_flags & PBF_WRITE) { + 1d0c: eb 44 jmp 1d52 <_page_buf_page_apply+0xfa> + 1d0e: 89 f6 mov %esi,%esi + 1d10: f6 c3 02 test $0x2,%bl + 1d13: 74 62 je 1d77 
<_page_buf_page_apply+0x11f> + int locking = (pb->pb_flags & _PBF_LOCKABLE) == 0; + 1d15: c1 eb 13 shr $0x13,%ebx + 1d18: 83 f3 01 xor $0x1,%ebx + 1d1b: 83 e3 01 and $0x1,%ebx + + /* Check we need to lock pages */ + if (locking && (pb->pb_locked == 0)) + 1d1e: 74 12 je 1d32 <_page_buf_page_apply+0xda> + 1d20: 80 bd 82 00 00 00 00 cmpb $0x0,0x82(%ebp) + 1d27: 75 09 jne 1d32 <_page_buf_page_apply+0xda> + lock_page(page); + 1d29: 8b 44 24 2c mov 0x2c(%esp,1),%eax + 1d2d: e8 fc ff ff ff call 1d2e <_page_buf_page_apply+0xd6> + _pagebuf_page_io(page, pb, bn, dev, blocksize, + 1d32: 31 c9 xor %ecx,%ecx + 1d34: 83 7c 24 38 00 cmpl $0x0,0x38(%esp,1) + 1d39: 74 13 je 1d4e <_page_buf_page_apply+0xf6> + 1d3b: 8b 55 08 mov 0x8(%ebp),%edx + 1d3e: 81 e2 00 00 00 10 and $0x10000000,%edx + 1d44: b8 01 00 00 00 mov $0x1,%eax + 1d49: 85 d2 test %edx,%edx + 1d4b: 0f 45 c8 cmovne %eax,%ecx + 1d4e: 51 push %ecx + 1d4f: 6a 01 push $0x1 + 1d51: 53 push %ebx + 1d52: 8b 44 24 40 mov 0x40(%esp,1),%eax + 1d56: 50 push %eax + 1d57: 8b 54 24 40 mov 0x40(%esp,1),%edx + 1d5b: 52 push %edx + 1d5c: 8b 44 24 28 mov 0x28(%esp,1),%eax + 1d60: 50 push %eax + 1d61: 0f b7 44 24 32 movzwl 0x32(%esp,1),%eax + 1d66: 50 push %eax + 1d67: 57 push %edi + 1d68: 56 push %esi + 1d69: 55 push %ebp + 1d6a: 8b 54 24 54 mov 0x54(%esp,1),%edx + 1d6e: 52 push %edx + 1d6f: e8 50 fa ff ff call 17c4 <_pagebuf_page_io> + (off_t)pg_offset, pg_length, locking, WRITE, + last && (pb->pb_flags & PBF_FLUSH)); + } + 1d74: 83 c4 2c add $0x2c,%esp + + return ret_len; + 1d77: 8b 44 24 10 mov 0x10(%esp,1),%eax + 1d7b: 5b pop %ebx + 1d7c: 5e pop %esi + 1d7d: 5f pop %edi + 1d7e: 5d pop %ebp + 1d7f: 83 c4 0c add $0xc,%esp + 1d82: c3 ret +} + 1d83: 90 nop + +0000000000001d84 : + +/* + * pagebuf_iorequest + * + * pagebuf_iorequest is the core I/O request routine. + * It assumes that the buffer is well-formed and + * mapped and ready for physical I/O, unlike + * pagebuf_iostart() and pagebuf_iophysio(). 
Those + * routines call the pagebuf_ioinitiate routine to start I/O, + * if it is present, or else call pagebuf_iorequest() + * directly if the pagebuf_ioinitiate routine is not present. + * + * This function will be responsible for ensuring access to the + * pages is restricted whilst I/O is in progress - for locking + * pagebufs the pagebuf lock is the mediator, for non-locking + * pagebufs the pages will be locked. In the locking case we + * need to use the pagebuf lock as multiple meta-data buffers + * will reference the same page. + */ +int +pagebuf_iorequest( /* start real I/O */ + page_buf_t *pb) /* buffer to convey to device */ +{ + 1d84: 83 ec 20 sub $0x20,%esp + 1d87: 57 push %edi + 1d88: 56 push %esi + 1d89: 53 push %ebx + 1d8a: 8b 74 24 30 mov 0x30(%esp,1),%esi + int status = 0; + + PB_TRACE(pb, PB_TRACE_REC(ioreq), 0); + + if (pb->pb_flags & PBF_DELWRI) { + 1d8e: 8b 46 08 mov 0x8(%esi),%eax + 1d91: a8 40 test $0x40,%al + 1d93: 74 12 je 1da7 + pagebuf_delwri_queue(pb, 1); + 1d95: 6a 01 push $0x1 + 1d97: 56 push %esi + 1d98: e8 fc ff ff ff call 1d99 + return status; + 1d9d: 31 c0 xor %eax,%eax + 1d9f: 83 c4 08 add $0x8,%esp + 1da2: e9 0f 01 00 00 jmp 1eb6 + } + + if (pb->pb_flags & PBF_WRITE) { + 1da7: a8 02 test $0x2,%al + 1da9: 0f 84 ac 00 00 00 je 1e5b + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 1daf: b8 00 e0 ff ff mov $0xffffe000,%eax + 1db4: 21 e0 and %esp,%eax + 1db6: c7 44 24 0c 00 00 00 movl $0x0,0xc(%esp,1) + 1dbd: 00 + 1dbe: c7 44 24 10 00 00 00 movl $0x0,0x10(%esp,1) + 1dc5: 00 + 1dc6: c7 44 24 14 00 00 00 movl $0x0,0x14(%esp,1) + 1dcd: 00 + 1dce: c7 44 24 18 00 00 00 movl $0x0,0x18(%esp,1) + 1dd5: 00 + 1dd6: 89 44 24 10 mov %eax,0x10(%esp,1) + 1dda: c7 44 24 1c 00 00 00 movl $0x0,0x1c(%esp,1) + 1de1: 00 + 1de2: 89 44 24 20 mov %eax,0x20(%esp,1) + 1de6: c7 44 24 24 00 00 00 movl $0x0,0x24(%esp,1) + 1ded: 00 + 1dee: c7 44 24 28 00 00 00 
movl $0x0,0x28(%esp,1) + 1df5: 00 + 1df6: 8b 86 b8 00 00 00 mov 0xb8(%esi),%eax + 1dfc: 85 c0 test %eax,%eax + 1dfe: 74 5b je 1e5b + 1e00: 8d 9e bc 00 00 00 lea 0xbc(%esi),%ebx + 1e06: 8d 54 24 1c lea 0x1c(%esp,1),%edx + 1e0a: 89 d8 mov %ebx,%eax + 1e0c: e8 fc ff ff ff call 1e0d + 1e11: 89 df mov %ebx,%edi + 1e13: eb 1e jmp 1e33 +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + 1e15: 81 3d 00 00 00 00 00 cmpl $0x0,0x0 + 1e1c: 00 00 00 + 1e1f: 74 0d je 1e2e + __run_task_queue(list); + 1e21: 68 00 00 00 00 push $0x0 + 1e26: e8 fc ff ff ff call 1e27 + 1e2b: 83 c4 04 add $0x4,%esp + 1e2e: e8 fc ff ff ff call 1e2f + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 1e33: bb 00 e0 ff ff mov $0xffffe000,%ebx + 1e38: 21 e3 and %esp,%ebx + 1e3a: c7 03 02 00 00 00 movl $0x2,(%ebx) + 1e40: 8b 86 b8 00 00 00 mov 0xb8(%esi),%eax + 1e46: 85 c0 test %eax,%eax + 1e48: 75 cb jne 1e15 + 1e4a: 8d 54 24 1c lea 0x1c(%esp,1),%edx + 1e4e: 89 f8 mov %edi,%eax + 1e50: e8 fc ff ff ff call 1e51 + 1e55: c7 03 00 00 00 00 movl $0x0,(%ebx) + _pagebuf_wait_unpin(pb); + } + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling iodone too early + */ + atomic_set(&PBP(pb)->pb_io_remaining, 1); + 1e5b: c7 86 b4 00 00 00 01 movl $0x1,0xb4(%esi) + 1e62: 00 00 00 + status = _pagebuf_segment_apply(pb); + 1e65: 56 push %esi + 1e66: e8 95 02 00 00 call 2100 <_pagebuf_segment_apply> + 1e6b: 89 c3 mov %eax,%ebx + * cases. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ int atomic_dec_and_test(atomic_t *v) +{ + 1e6d: 83 c4 04 add $0x4,%esp + unsigned char c; + + __asm__ __volatile__( + 1e70: f0 ff 8e b4 00 00 00 lock decl 0xb4(%esi) + 1e77: 0f 94 c0 sete %al + + /* Drop our count and if everything worked we are done */ + if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) { + 1e7a: 84 c0 test %al,%al + 1e7c: 74 08 je 1e86 + pagebuf_iodone(pb); + 1e7e: 56 push %esi + 1e7f: e8 fc ff ff ff call 1e80 + } else if ((pb->pb_flags & (PBF_SYNC|PBF_ASYNC)) == PBF_SYNC) { + 1e84: eb 25 jmp 1eab + 1e86: 8b 46 08 mov 0x8(%esi),%eax + 1e89: 25 10 01 00 00 and $0x110,%eax + 1e8e: 3d 00 01 00 00 cmp $0x100,%eax + 1e93: 75 19 jne 1eae +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + 1e95: 81 3d 00 00 00 00 00 cmpl $0x0,0x0 + 1e9c: 00 00 00 + 1e9f: 74 0d je 1eae + __run_task_queue(list); + 1ea1: 68 00 00 00 00 push $0x0 + 1ea6: e8 fc ff ff ff call 1ea7 + 1eab: 83 c4 04 add $0x4,%esp + run_task_queue(&tq_disk); + } + + return status < 0 ? status : 0; + 1eae: 31 c0 xor %eax,%eax + 1eb0: 83 fb 01 cmp $0x1,%ebx + 1eb3: 0f 4c c3 cmovl %ebx,%eax +} + 1eb6: 5b pop %ebx + 1eb7: 5e pop %esi + 1eb8: 5f pop %edi + 1eb9: 83 c4 20 add $0x20,%esp + 1ebc: c3 ret + 1ebd: 8d 76 00 lea 0x0(%esi),%esi + +0000000000001ec0 : + +/* + * pagebuf_iowait + * + * pagebuf_iowait waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. In any case, it returns + * the error code, if any, or 0 if there is no error. 
+ */ +int +pagebuf_iowait( + page_buf_t *pb) +{ + 1ec0: 53 push %ebx + 1ec1: 8b 5c 24 08 mov 0x8(%esp,1),%ebx +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + 1ec5: 81 3d 00 00 00 00 00 cmpl $0x0,0x0 + 1ecc: 00 00 00 + 1ecf: 74 0f je 1ee0 + __run_task_queue(list); + 1ed1: 68 00 00 00 00 push $0x0 + 1ed6: e8 fc ff ff ff call 1ed7 + 1edb: 83 c4 04 add $0x4,%esp +} + 1ede: 89 f6 mov %esi,%esi + * "__down_failed" is a special asm handler that calls the C + * routine that actually waits. See arch/i386/kernel/semaphore.c + */ +static inline void down(struct semaphore * sem) +{ + 1ee0: 8d 4b 58 lea 0x58(%ebx),%ecx +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + + __asm__ __volatile__( + 1ee3: f0 ff 4b 58 lock decl 0x58(%ebx) + 1ee7: 0f 88 86 0f 00 00 js 2e73 + PB_TRACE(pb, PB_TRACE_REC(iowait), 0); + run_task_queue(&tq_disk); + down(&pb->pb_iodonesema); + PB_TRACE(pb, PB_TRACE_REC(iowaited), (int)pb->pb_error); + return pb->pb_error; + 1eed: 0f b7 43 7c movzwl 0x7c(%ebx),%eax + 1ef1: 5b pop %ebx + 1ef2: c3 ret +} + 1ef3: 90 nop + +0000000000001ef4 : + +STATIC void * +pagebuf_mapout_locked( + page_buf_t *pb) +{ + 1ef4: 53 push %ebx + 1ef5: 8b 4c 24 08 mov 0x8(%esp,1),%ecx + void *old_addr = NULL; + 1ef9: 31 c0 xor %eax,%eax + + if (pb->pb_flags & PBF_MAPPED) { + 1efb: 8b 51 08 mov 0x8(%ecx),%edx + 1efe: f6 c2 04 test $0x4,%dl + 1f01: 74 26 je 1f29 + if (pb->pb_flags & _PBF_ADDR_ALLOCATED) + 1f03: f7 c2 00 00 00 01 test $0x1000000,%edx + 1f09: 74 0e je 1f19 + old_addr = pb->pb_addr - pb->pb_offset; + 1f0b: 8b 59 34 mov 0x34(%ecx),%ebx + 1f0e: 0f b7 81 80 00 00 00 movzwl 0x80(%ecx),%eax + 1f15: 29 c3 sub %eax,%ebx + 1f17: 89 d8 mov %ebx,%eax + pb->pb_addr = NULL; + 1f19: c7 41 34 00 00 00 00 movl $0x0,0x34(%ecx) + pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED); + 1f20: 81 e2 fb ff ff fe and $0xfefffffb,%edx + 1f26: 89 51 08 mov %edx,0x8(%ecx) + } + + return old_addr; /* 
Caller must free the address space, + 1f29: 5b pop %ebx + 1f2a: c3 ret + * we are under a spin lock, probably + * not safe to do vfree here + */ +} + 1f2b: 90 nop + +0000000000001f2c : + +caddr_t +pagebuf_offset( + page_buf_t *pb, + off_t offset) +{ + 1f2c: 8b 44 24 04 mov 0x4(%esp,1),%eax + struct page *page; + + offset += pb->pb_offset; + 1f30: 0f b7 90 80 00 00 00 movzwl 0x80(%eax),%edx + + page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; + 1f37: 8b 80 84 00 00 00 mov 0x84(%eax),%eax + 1f3d: 03 54 24 08 add 0x8(%esp,1),%edx + 1f41: 89 d1 mov %edx,%ecx + 1f43: c1 f9 0c sar $0xc,%ecx + 1f46: 8b 04 88 mov (%eax,%ecx,4),%eax + return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); + 1f49: 81 e2 ff 0f 00 00 and $0xfff,%edx + 1f4f: 03 50 2c add 0x2c(%eax),%edx + 1f52: 89 d0 mov %edx,%eax + 1f54: c3 ret +} + 1f55: 8d 76 00 lea 0x0(%esi),%esi + +0000000000001f58 : + +/* + * pagebuf_segment + * + * pagebuf_segment is used to retrieve the various contiguous + * segments of a buffer. The variable addressed by the + * loff_t * should be initialized to 0, and successive + * calls will update to point to the segment following the one + * returned. 
+ */ +STATIC void +pagebuf_segment( + page_buf_t *pb, /* buffer to examine */ + loff_t *boff_p,/* offset in buffer of next */ + /* next segment (updated) */ + struct page **spage_p, /* page (updated) */ + /* (NULL if not in page array) */ + size_t *soff_p,/* offset in page (updated) */ + size_t *ssize_p) /* segment length (updated) */ +{ + loff_t kpboff; /* offset in pagebuf */ + int kpi; /* page index in pagebuf */ + size_t slen; /* segment length */ + + kpboff = *boff_p; + 1f58: 83 ec 08 sub $0x8,%esp + 1f5b: 55 push %ebp + 1f5c: 57 push %edi + 1f5d: 56 push %esi + 1f5e: 53 push %ebx + 1f5f: 8b 44 24 20 mov 0x20(%esp,1),%eax + + kpi = page_buf_btoct(kpboff + pb->pb_offset); + 1f63: 8b 4c 24 1c mov 0x1c(%esp,1),%ecx + 1f67: 8b 30 mov (%eax),%esi + 1f69: 8b 78 04 mov 0x4(%eax),%edi + 1f6c: 0f b7 81 80 00 00 00 movzwl 0x80(%ecx),%eax + 1f73: 89 f1 mov %esi,%ecx + 1f75: 89 fb mov %edi,%ebx + 1f77: 01 c1 add %eax,%ecx + 1f79: 83 d3 00 adc $0x0,%ebx + 1f7c: 89 da mov %ebx,%edx + + *spage_p = pb->pb_pages[kpi]; + 1f7e: 8b 5c 24 1c mov 0x1c(%esp,1),%ebx + 1f82: 89 c8 mov %ecx,%eax + 1f84: 0f ac d0 0c shrd $0xc,%edx,%eax + 1f88: c1 fa 0c sar $0xc,%edx + 1f8b: 89 c2 mov %eax,%edx + 1f8d: 8b 83 84 00 00 00 mov 0x84(%ebx),%eax + + *soff_p = page_buf_poff(kpboff + pb->pb_offset); + slen = PAGE_CACHE_SIZE - *soff_p; + 1f93: bd 00 10 00 00 mov $0x1000,%ebp + 1f98: 8b 04 90 mov (%eax,%edx,4),%eax + 1f9b: 8b 54 24 24 mov 0x24(%esp,1),%edx + 1f9f: 89 02 mov %eax,(%edx) + 1fa1: 8b 4c 24 28 mov 0x28(%esp,1),%ecx + 1fa5: 0f b7 83 80 00 00 00 movzwl 0x80(%ebx),%eax + 1fac: 01 f0 add %esi,%eax + 1fae: 25 ff 0f 00 00 and $0xfff,%eax + 1fb3: 89 01 mov %eax,(%ecx) + if (slen > (pb->pb_count_desired - kpboff)) + 1fb5: 8b 54 24 1c mov 0x1c(%esp,1),%edx + 1fb9: 29 c5 sub %eax,%ebp + 1fbb: 89 e9 mov %ebp,%ecx + 1fbd: 31 db xor %ebx,%ebx + 1fbf: 8b 42 30 mov 0x30(%edx),%eax + 1fc2: 31 d2 xor %edx,%edx + 1fc4: 89 44 24 10 mov %eax,0x10(%esp,1) + 1fc8: 89 54 24 14 mov %edx,0x14(%esp,1) + 1fcc: 
8b 44 24 10 mov 0x10(%esp,1),%eax + 1fd0: 8b 54 24 14 mov 0x14(%esp,1),%edx + 1fd4: 29 f0 sub %esi,%eax + 1fd6: 19 fa sbb %edi,%edx + 1fd8: 39 d3 cmp %edx,%ebx + 1fda: 7f 06 jg 1fe2 + 1fdc: 75 0d jne 1feb + 1fde: 39 c1 cmp %eax,%ecx + 1fe0: 76 09 jbe 1feb + slen = (pb->pb_count_desired - kpboff); + 1fe2: 8b 4c 24 1c mov 0x1c(%esp,1),%ecx + 1fe6: 8b 69 30 mov 0x30(%ecx),%ebp + 1fe9: 29 f5 sub %esi,%ebp + *ssize_p = slen; + 1feb: 8b 44 24 2c mov 0x2c(%esp,1),%eax + 1fef: 89 28 mov %ebp,(%eax) + + *boff_p = *boff_p + slen; + 1ff1: 8b 5c 24 20 mov 0x20(%esp,1),%ebx + 1ff5: 01 2b add %ebp,(%ebx) + 1ff7: 83 53 04 00 adcl $0x0,0x4(%ebx) + 1ffb: 5b pop %ebx + 1ffc: 5e pop %esi + 1ffd: 5f pop %edi + 1ffe: 5d pop %ebp + 1fff: 59 pop %ecx + 2000: 5a pop %edx + 2001: c3 ret +} + 2002: 89 f6 mov %esi,%esi + +0000000000002004 : + +/* + * pagebuf_iomove + * + * Move data into or out of a buffer. + */ +void +pagebuf_iomove( + page_buf_t *pb, /* buffer to process */ + off_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + caddr_t data, /* data address */ + page_buf_rw_t mode) /* read/write flag */ +{ + 2004: 55 push %ebp + 2005: 89 e5 mov %esp,%ebp + 2007: 83 ec 14 sub $0x14,%esp + 200a: 57 push %edi + 200b: 56 push %esi + 200c: 53 push %ebx + 200d: 8b 5d 14 mov 0x14(%ebp),%ebx + loff_t cboff; + size_t cpoff; + size_t csize; + struct page *page; + + cboff = boff; + 2010: 8b 45 0c mov 0xc(%ebp),%eax + 2013: 99 cltd + 2014: 89 55 fc mov %edx,0xfffffffc(%ebp) + boff += bsize; /* last */ + 2017: 8b 55 10 mov 0x10(%ebp),%edx + 201a: 89 45 f8 mov %eax,0xfffffff8(%ebp) + 201d: 01 d0 add %edx,%eax + 201f: 89 45 0c mov %eax,0xc(%ebp) + + while (cboff < boff) { + 2022: 89 c2 mov %eax,%edx + 2024: 89 c1 mov %eax,%ecx + 2026: 8d 45 f8 lea 0xfffffff8(%ebp),%eax + 2029: 8b 70 04 mov 0x4(%eax),%esi + 202c: c1 f9 1f sar $0x1f,%ecx + 202f: 39 f1 cmp %esi,%ecx + 2031: 7f 0f jg 2042 + 2033: 0f 85 ba 00 00 00 jne 20f3 + 2039: 3b 55 f8 cmp 0xfffffff8(%ebp),%edx + 203c: 0f 86 b1 
00 00 00 jbe 20f3 + pagebuf_segment(pb, &cboff, &page, &cpoff, &csize); + 2042: 8d 45 f4 lea 0xfffffff4(%ebp),%eax + 2045: 50 push %eax + 2046: 8d 45 f0 lea 0xfffffff0(%ebp),%eax + 2049: 50 push %eax + 204a: 8d 45 ec lea 0xffffffec(%ebp),%eax + 204d: 50 push %eax + 204e: 8d 45 f8 lea 0xfffffff8(%ebp),%eax + 2051: 50 push %eax + 2052: 8b 55 08 mov 0x8(%ebp),%edx + 2055: 52 push %edx + 2056: e8 fd fe ff ff call 1f58 + assert(((csize + cpoff) <= PAGE_CACHE_SIZE)); + 205b: 83 c4 14 add $0x14,%esp + + switch (mode) { + 205e: 83 7d 18 02 cmpl $0x2,0x18(%ebp) + 2062: 74 4c je 20b0 + 2064: 77 0a ja 2070 + 2066: 83 7d 18 01 cmpl $0x1,0x18(%ebp) + 206a: 74 34 je 20a0 + 206c: eb 62 jmp 20d0 + 206e: 89 f6 mov %esi,%esi + 2070: 83 7d 18 03 cmpl $0x3,0x18(%ebp) + 2074: 75 5a jne 20d0 + case PBRW_ZERO: + memset(page_address(page) + cpoff, 0, csize); + 2076: 8b 45 ec mov 0xffffffec(%ebp),%eax + * things 32 bits at a time even when we don't know the size of the + * area at compile-time.. + */ +static inline void * __constant_c_memset(void * s, unsigned long c, size_t count) +{ + 2079: 8b 7d f0 mov 0xfffffff0(%ebp),%edi + 207c: 8b 75 f4 mov 0xfffffff4(%ebp),%esi +int d0, d1; +__asm__ __volatile__( + 207f: 89 f1 mov %esi,%ecx + 2081: c1 e9 02 shr $0x2,%ecx + 2084: 89 f2 mov %esi,%edx + 2086: 03 78 2c add 0x2c(%eax),%edi + 2089: 31 c0 xor %eax,%eax + 208b: f3 ab repz stos %eax,%es:(%edi) + 208d: f6 c2 02 test $0x2,%dl + 2090: 74 02 je 2094 + 2092: 66 ab stos %ax,%es:(%edi) + 2094: f6 c2 01 test $0x1,%dl + 2097: 74 01 je 209a + 2099: aa stos %al,%es:(%edi) + break; + 209a: eb 34 jmp 20d0 + case PBRW_READ: + memcpy(data, page_address(page) + cpoff, csize); + 209c: 8d 74 26 00 lea 0x0(%esi,1),%esi + 20a0: 8b 45 ec mov 0xffffffec(%ebp),%eax +return __res; +} + +static inline void * __memcpy(void * to, const void * from, size_t n) +{ + 20a3: 8b 75 f0 mov 0xfffffff0(%ebp),%esi +int d0, d1, d2; +__asm__ __volatile__( + 20a6: 89 df mov %ebx,%edi + 20a8: 03 70 2c add 0x2c(%eax),%esi + break; + 
20ab: eb 0e jmp 20bb + case PBRW_WRITE: + memcpy(page_address(page) + cpoff, data, csize); + 20ad: 8d 76 00 lea 0x0(%esi),%esi + 20b0: 8b 45 ec mov 0xffffffec(%ebp),%eax +return __res; +} + +static inline void * __memcpy(void * to, const void * from, size_t n) +{ + 20b3: 8b 7d f0 mov 0xfffffff0(%ebp),%edi +int d0, d1, d2; +__asm__ __volatile__( + 20b6: 89 de mov %ebx,%esi + 20b8: 03 78 2c add 0x2c(%eax),%edi + 20bb: 8b 45 f4 mov 0xfffffff4(%ebp),%eax + 20be: 89 c1 mov %eax,%ecx + 20c0: c1 e9 02 shr $0x2,%ecx + 20c3: f3 a5 repz movsl %ds:(%esi),%es:(%edi) + 20c5: a8 02 test $0x2,%al + 20c7: 74 02 je 20cb + 20c9: 66 a5 movsw %ds:(%esi),%es:(%edi) + 20cb: a8 01 test $0x1,%al + 20cd: 74 01 je 20d0 + 20cf: a4 movsb %ds:(%esi),%es:(%edi) + } + + data += csize; + } + 20d0: 8b 45 0c mov 0xc(%ebp),%eax + 20d3: 89 c2 mov %eax,%edx + 20d5: 89 c1 mov %eax,%ecx + 20d7: 8b 45 fc mov 0xfffffffc(%ebp),%eax + 20da: 03 5d f4 add 0xfffffff4(%ebp),%ebx + 20dd: c1 f9 1f sar $0x1f,%ecx + 20e0: 39 c1 cmp %eax,%ecx + 20e2: 0f 8f 5a ff ff ff jg 2042 + 20e8: 75 09 jne 20f3 + 20ea: 3b 55 f8 cmp 0xfffffff8(%ebp),%edx + 20ed: 0f 87 4f ff ff ff ja 2042 + 20f3: 8d 65 e0 lea 0xffffffe0(%ebp),%esp + 20f6: 5b pop %ebx + 20f7: 5e pop %esi + 20f8: 5f pop %edi + 20f9: 89 ec mov %ebp,%esp + 20fb: 5d pop %ebp + 20fc: c3 ret +} + 20fd: 8d 76 00 lea 0x0(%esi),%esi + +0000000000002100 <_pagebuf_segment_apply>: + +/* + * _pagebuf_segment_apply + * + * Applies _page_buf_page_apply to each segment of the page_buf_t. 
+ */ +STATIC int +_pagebuf_segment_apply( /* apply function to segments */ + page_buf_t *pb) /* buffer to examine */ +{ + int buf_index, sval, status = 0; + loff_t buffer_offset = pb->pb_file_offset; + 2100: 83 ec 14 sub $0x14,%esp + 2103: 55 push %ebp + 2104: 57 push %edi + 2105: 56 push %esi + 2106: 53 push %ebx + 2107: 8b 54 24 28 mov 0x28(%esp,1),%edx + 210b: 8b 42 24 mov 0x24(%edx),%eax + 210e: 8b 52 28 mov 0x28(%edx),%edx + 2111: 89 44 24 18 mov %eax,0x18(%esp,1) + size_t buffer_len = pb->pb_count_desired; + 2115: 8b 44 24 28 mov 0x28(%esp,1),%eax + 2119: 89 54 24 1c mov %edx,0x1c(%esp,1) + 211d: 8b 58 30 mov 0x30(%eax),%ebx + size_t page_offset, len, total = 0; + 2120: c7 44 24 14 00 00 00 movl $0x0,0x14(%esp,1) + 2127: 00 + size_t cur_offset, cur_len; + + pagebuf_hold(pb); + 2128: 50 push %eax + 2129: e8 fc ff ff ff call 212a <_pagebuf_segment_apply+0x2a> + + cur_offset = pb->pb_offset; + 212e: 8b 54 24 2c mov 0x2c(%esp,1),%edx + cur_len = buffer_len; + 2132: 89 de mov %ebx,%esi + + for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) { + 2134: 31 ed xor %ebp,%ebp + 2136: 83 c4 04 add $0x4,%esp + 2139: 0f b7 ba 80 00 00 00 movzwl 0x80(%edx),%edi + 2140: c7 44 24 20 00 00 00 movl $0x0,0x20(%esp,1) + 2147: 00 + 2148: 66 83 7a 7e 00 cmpw $0x0,0x7e(%edx) + 214d: 0f 84 9e 00 00 00 je 21f1 <_pagebuf_segment_apply+0xf1> + if (cur_len == 0) + 2153: 85 f6 test %esi,%esi + 2155: 0f 84 96 00 00 00 je 21f1 <_pagebuf_segment_apply+0xf1> + break; + if (cur_offset >= PAGE_CACHE_SIZE) { + 215b: 81 ff ff 0f 00 00 cmp $0xfff,%edi + 2161: 76 0d jbe 2170 <_pagebuf_segment_apply+0x70> + cur_offset -= PAGE_CACHE_SIZE; + 2163: 81 c7 00 f0 ff ff add $0xfffff000,%edi + continue; + 2169: 8d 55 01 lea 0x1(%ebp),%edx + 216c: eb 71 jmp 21df <_pagebuf_segment_apply+0xdf> + } + 216e: 89 f6 mov %esi,%esi + + page_offset = cur_offset; + cur_offset = 0; + + len = PAGE_CACHE_SIZE - page_offset; + if (len > cur_len) + len = cur_len; + cur_len -= len; + + sval = 
_page_buf_page_apply(pb, buffer_offset, + 2170: 8b 44 24 28 mov 0x28(%esp,1),%eax + 2174: 89 fa mov %edi,%edx + 2176: b9 00 10 00 00 mov $0x1000,%ecx + 217b: 29 d1 sub %edx,%ecx + 217d: 39 f1 cmp %esi,%ecx + 217f: 0f 47 ce cmova %esi,%ecx + 2182: 8d 5d 01 lea 0x1(%ebp),%ebx + 2185: 0f b7 40 7e movzwl 0x7e(%eax),%eax + 2189: 39 c3 cmp %eax,%ebx + 218b: 0f 94 c0 sete %al + 218e: 0f b6 c0 movzbl %al,%eax + 2191: 50 push %eax + 2192: 51 push %ecx + 2193: 52 push %edx + 2194: 8b 54 24 34 mov 0x34(%esp,1),%edx + 2198: 8b 82 84 00 00 00 mov 0x84(%edx),%eax + 219e: 31 ff xor %edi,%edi + 21a0: 29 ce sub %ecx,%esi + 21a2: 8b 04 a8 mov (%eax,%ebp,4),%eax + 21a5: 50 push %eax + 21a6: 8b 44 24 28 mov 0x28(%esp,1),%eax + 21aa: 8b 54 24 2c mov 0x2c(%esp,1),%edx + 21ae: 52 push %edx + 21af: 50 push %eax + 21b0: 8b 54 24 40 mov 0x40(%esp,1),%edx + 21b4: 52 push %edx + 21b5: e8 9e fa ff ff call 1c58 <_page_buf_page_apply> + pb->pb_pages[buf_index], page_offset, len, + buf_index+1 == pb->pb_page_count); + if (sval <= 0) { + 21ba: 83 c4 1c add $0x1c,%esp + 21bd: 89 da mov %ebx,%edx + 21bf: 85 c0 test %eax,%eax + 21c1: 7f 0d jg 21d0 <_pagebuf_segment_apply+0xd0> + status = sval; + 21c3: 89 44 24 20 mov %eax,0x20(%esp,1) + break; + 21c7: eb 28 jmp 21f1 <_pagebuf_segment_apply+0xf1> + 21c9: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + } else { + len = sval; + 21d0: 89 c1 mov %eax,%ecx + total += len; + 21d2: 01 4c 24 14 add %ecx,0x14(%esp,1) + } + + buffer_offset += len; + 21d6: 01 4c 24 18 add %ecx,0x18(%esp,1) + 21da: 83 54 24 1c 00 adcl $0x0,0x1c(%esp,1) + 21df: 89 d5 mov %edx,%ebp + 21e1: 8b 54 24 28 mov 0x28(%esp,1),%edx + 21e5: 0f b7 42 7e movzwl 0x7e(%edx),%eax + 21e9: 39 c5 cmp %eax,%ebp + 21eb: 0f 8c 62 ff ff ff jl 2153 <_pagebuf_segment_apply+0x53> + buffer_len -= len; + } + + pagebuf_rele(pb); + 21f1: 8b 44 24 28 mov 0x28(%esp,1),%eax + 21f5: 50 push %eax + 21f6: e8 fc ff ff ff call 21f7 <_pagebuf_segment_apply+0xf7> + + if (!status) + 21fb: 83 c4 04 add $0x4,%esp + 21fe: 8b 54 
24 14 mov 0x14(%esp,1),%edx + 2202: 83 7c 24 20 00 cmpl $0x0,0x20(%esp,1) + 2207: 0f 45 54 24 20 cmovne 0x20(%esp,1),%edx + 220c: 89 54 24 20 mov %edx,0x20(%esp,1) + status = total; + + return (status); + 2210: 89 d0 mov %edx,%eax + 2212: 5b pop %ebx + 2213: 5e pop %esi + 2214: 5f pop %edi + 2215: 5d pop %ebp + 2216: 83 c4 14 add $0x14,%esp + 2219: c3 ret +} + 221a: 89 f6 mov %esi,%esi + +000000000000221c : + + +/* + * Pagebuf delayed write buffer handling + */ + +void +pagebuf_delwri_queue( + page_buf_t *pb, + int unlock) +{ + 221c: 57 push %edi + 221d: 56 push %esi + 221e: 53 push %ebx + 221f: 8b 5c 24 10 mov 0x10(%esp,1),%ebx + 2223: 8b 7c 24 14 mov 0x14(%esp,1),%edi + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 2227: 8b 35 28 00 00 00 mov 0x28,%esi + 222d: 83 c6 08 add $0x8,%esi +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 2230: 81 7e 04 ad 4e ad de cmpl $0xdead4ead,0x4(%esi) + 2237: 74 1a je 2253 +printk("eip: %p\n", &&here); + 2239: 68 30 22 00 00 push $0x2230 + 223e: 68 2b 00 00 00 push $0x2b + 2243: e8 fc ff ff ff call 2244 + BUG(); + 2248: 0f 0b ud2a + 224a: 85 00 test %eax,(%eax) + 224c: 00 00 add %al,(%eax) + 224e: 00 00 add %al,(%eax) + } + 2250: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 2253: f0 fe 0e lock decb (%esi) + 2256: 0f 88 21 0c 00 00 js 2e7d + PB_TRACE(pb, PB_TRACE_REC(delwri_q), unlock); + spin_lock(&pb_daemon->pb_delwrite_lock); + /* If already in the queue, dequeue and place at tail */ + if (!list_empty(&pb->pb_list)) { + 225c: 39 1b cmp %ebx,(%ebx) + 225e: 74 14 je 2274 + if (unlock) { + 2260: 85 ff test %edi,%edi + 2262: 74 04 je 2268 + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_dec(atomic_t *v) +{ + __asm__ __volatile__( + 2264: f0 ff 4b 18 lock decl 0x18(%ebx) + * the prev/next entries already! 
+ */ +static __inline__ void __list_del(struct list_head * prev, + struct list_head * next) +{ + 2268: 8b 53 04 mov 0x4(%ebx),%edx + 226b: 8b 03 mov (%ebx),%eax + next->prev = prev; + 226d: 89 50 04 mov %edx,0x4(%eax) + prev->next = next; + 2270: 89 02 mov %eax,(%edx) + atomic_dec(&pb->pb_hold); + } + list_del(&pb->pb_list); + } else { + 2272: eb 08 jmp 227c + pb_daemon->pb_delwri_cnt++; + 2274: a1 28 00 00 00 mov 0x28,%eax + 2279: ff 40 18 incl 0x18(%eax) + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static __inline__ void list_add_tail(struct list_head *new, struct list_head *head) +{ + 227c: 8b 0d 28 00 00 00 mov 0x28,%ecx + 2282: 8d 41 10 lea 0x10(%ecx),%eax + 2285: 8b 50 04 mov 0x4(%eax),%edx + 2288: 89 58 04 mov %ebx,0x4(%eax) + 228b: 89 03 mov %eax,(%ebx) + } + list_add_tail(&pb->pb_list, &pb_daemon->pb_delwrite_l); + PBP(pb)->pb_flushtime = jiffies + pb_params.p_un.age_buffer; + 228d: a1 00 00 00 00 mov 0x0,%eax + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + 2292: 89 53 04 mov %edx,0x4(%ebx) + prev->next = new; + 2295: 89 1a mov %ebx,(%edx) + 2297: 03 05 04 00 00 00 add 0x4,%eax + 229d: 89 83 b0 00 00 00 mov %eax,0xb0(%ebx) + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 22a3: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 22a5: 81 79 0c ad 4e ad de cmpl $0xdead4ead,0xc(%ecx) + 22ac: 74 08 je 22b6 + BUG(); + 22ae: 0f 0b ud2a + 22b0: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 22b6: 8a 41 08 mov 0x8(%ecx),%al + 22b9: 84 c0 test %al,%al + 22bb: 7e 08 jle 22c5 + BUG(); + 22bd: 0f 0b ud2a + 22bf: 6b 00 00 imul $0x0,(%eax),%eax + 22c2: 00 00 add %al,(%eax) + 22c4: 00 86 51 08 85 ff add %al,0xff850851(%esi) + spin_unlock(&pb_daemon->pb_delwrite_lock); + + if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) { + 22ca: 74 0f je 22db + 22cc: f6 43 0a 08 testb 
$0x8,0xa(%ebx) + 22d0: 74 09 je 22db + pagebuf_unlock(pb); + 22d2: 53 push %ebx + 22d3: e8 fc ff ff ff call 22d4 + } + 22d8: 83 c4 04 add $0x4,%esp + 22db: 5b pop %ebx + 22dc: 5e pop %esi + 22dd: 5f pop %edi + 22de: c3 ret +} + 22df: 90 nop + +00000000000022e0 : + +void +pagebuf_delwri_dequeue( + page_buf_t *pb) +{ + 22e0: 56 push %esi + 22e1: 53 push %ebx + 22e2: 8b 5c 24 0c mov 0xc(%esp,1),%ebx + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 22e6: 8b 35 28 00 00 00 mov 0x28,%esi + 22ec: 83 c6 08 add $0x8,%esi + 22ef: 90 nop +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 22f0: 81 7e 04 ad 4e ad de cmpl $0xdead4ead,0x4(%esi) + 22f7: 74 1a je 2313 +printk("eip: %p\n", &&here); + 22f9: 68 f0 22 00 00 push $0x22f0 + 22fe: 68 2b 00 00 00 push $0x2b + 2303: e8 fc ff ff ff call 2304 + BUG(); + 2308: 0f 0b ud2a + 230a: 85 00 test %eax,(%eax) + 230c: 00 00 add %al,(%eax) + 230e: 00 00 add %al,(%eax) + } + 2310: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 2313: f0 fe 0e lock decb (%esi) + 2316: 0f 88 6d 0b 00 00 js 2e89 + * the prev/next entries already! + */ +static __inline__ void __list_del(struct list_head * prev, + struct list_head * next) +{ + 231c: 8b 53 04 mov 0x4(%ebx),%edx + 231f: 8b 03 mov (%ebx),%eax + next->prev = prev; + 2321: 89 50 04 mov %edx,0x4(%eax) + prev->next = next; + 2324: 89 02 mov %eax,(%edx) +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline__ void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static __inline__ void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); + 2326: 89 1b mov %ebx,(%ebx) + 2328: 89 5b 04 mov %ebx,0x4(%ebx) + PB_TRACE(pb, PB_TRACE_REC(delwri_uq), 0); + spin_lock(&pb_daemon->pb_delwrite_lock); + list_del_init(&pb->pb_list); + pb->pb_flags &= ~PBF_DELWRI; + 232b: 80 63 08 bf andb $0xbf,0x8(%ebx) + pb_daemon->pb_delwri_cnt--; + 232f: 8b 15 28 00 00 00 mov 0x28,%edx + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 2335: b1 01 mov $0x1,%cl + 2337: ff 4a 18 decl 0x18(%edx) +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 233a: 81 7a 0c ad 4e ad de cmpl $0xdead4ead,0xc(%edx) + 2341: 74 0d je 2350 + BUG(); + 2343: 0f 0b ud2a + 2345: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + 234b: 90 nop + 234c: 8d 74 26 00 lea 0x0(%esi,1),%esi + if (!spin_is_locked(lock)) + 2350: 8a 42 08 mov 0x8(%edx),%al + 2353: 84 c0 test %al,%al + 2355: 7e 08 jle 235f + BUG(); + 2357: 0f 0b ud2a + 2359: 6b 00 00 imul $0x0,(%eax),%eax + 235c: 00 00 add %al,(%eax) + 235e: 00 86 4a 08 5b 5e add %al,0x5e5b084a(%esi) +#endif + __asm__ __volatile__( + 2364: c3 ret + spin_unlock(&pb_daemon->pb_delwrite_lock); +} + 2365: 8d 76 00 lea 0x0(%esi),%esi + +0000000000002368 : + + +/* + * The pagebuf iodone daemon + */ + +STATIC int pb_daemons[NR_CPUS]; + +STATIC int +pagebuf_iodone_daemon( + void *__bind_cpu) +{ + 2368: 83 ec 2c sub $0x2c,%esp + 236b: 55 push %ebp + 236c: 57 push %edi + 236d: 56 push %esi + 236e: 53 push %ebx + 236f: 8b 7c 24 40 mov 0x40(%esp,1),%edi + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 2373: bb 00 e0 ff ff mov $0xffffe000,%ebx + 2378: 21 e3 and %esp,%ebx + int bind_cpu = (int) (long) __bind_cpu; + int cpu = cpu_logical_map(bind_cpu); + DECLARE_WAITQUEUE 
(wait, current); + 237a: c7 44 24 1c 00 00 00 movl $0x0,0x1c(%esp,1) + 2381: 00 + 2382: c7 44 24 20 00 00 00 movl $0x0,0x20(%esp,1) + 2389: 00 + 238a: c7 44 24 24 00 00 00 movl $0x0,0x24(%esp,1) + 2391: 00 + 2392: c7 44 24 28 00 00 00 movl $0x0,0x28(%esp,1) + 2399: 00 + 239a: 89 5c 24 20 mov %ebx,0x20(%esp,1) + 239e: c7 44 24 2c 00 00 00 movl $0x0,0x2c(%esp,1) + 23a5: 00 + 23a6: 89 5c 24 30 mov %ebx,0x30(%esp,1) + 23aa: c7 44 24 34 00 00 00 movl $0x0,0x34(%esp,1) + 23b1: 00 + 23b2: c7 44 24 38 00 00 00 movl $0x0,0x38(%esp,1) + 23b9: 00 + + /* Set up the thread */ + daemonize(); + 23ba: e8 fc ff ff ff call 23bb + + /* Avoid signals */ + spin_lock_irq(¤t->sigmask_lock); + 23bf: fa cli + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 23c0: 81 c3 5c 06 00 00 add $0x65c,%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 23c6: 81 7b 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ebx) + 23cd: 74 1a je 23e9 +printk("eip: %p\n", &&here); + 23cf: 68 c6 23 00 00 push $0x23c6 + 23d4: 68 2b 00 00 00 push $0x2b + 23d9: e8 fc ff ff ff call 23da + BUG(); + 23de: 0f 0b ud2a + 23e0: 85 00 test %eax,(%eax) + 23e2: 00 00 add %al,(%eax) + 23e4: 00 00 add %al,(%eax) + } + 23e6: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 23e9: f0 fe 0b lock decb (%ebx) + 23ec: 0f 88 a3 0a 00 00 js 2e95 + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 23f2: bb 00 e0 ff ff mov $0xffffe000,%ebx + 23f7: 21 e3 and %esp,%ebx + switch (_NSIG_WORDS) { + default: + memset(set, -1, sizeof(sigset_t)); + break; + case 2: set->sig[1] = -1; + 23f9: c7 83 6c 06 00 00 ff movl $0xffffffff,0x66c(%ebx) + 2400: ff ff ff + case 1: set->sig[0] = -1; + 2403: c7 83 68 06 00 00 ff movl $0xffffffff,0x668(%ebx) + 240a: ff ff ff + All callers should have t->sigmask_lock. 
*/ + +static inline void recalc_sigpending(struct task_struct *t) +{ + t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); + 240d: c7 43 08 00 00 00 00 movl $0x0,0x8(%ebx) + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 2414: 89 d8 mov %ebx,%eax + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 2416: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 2418: 81 b8 60 06 00 00 ad cmpl $0xdead4ead,0x660(%eax) + 241f: 4e ad de + 2422: 74 0c je 2430 + BUG(); + 2424: 0f 0b ud2a + 2426: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + 242c: 8d 74 26 00 lea 0x0(%esi,1),%esi + if (!spin_is_locked(lock)) + 2430: 8a 80 5c 06 00 00 mov 0x65c(%eax),%al + 2436: 84 c0 test %al,%al + 2438: 7e 08 jle 2442 + BUG(); + 243a: 0f 0b ud2a + 243c: 6b 00 00 imul $0x0,(%eax),%eax + 243f: 00 00 add %al,(%eax) + 2441: 00 86 93 5c 06 00 add %al,0x65c93(%esi) +#endif + __asm__ __volatile__( + 2447: 00 fb add %bh,%bl + sigfillset(¤t->blocked); + recalc_sigpending(current); + spin_unlock_irq(¤t->sigmask_lock); + + /* Migrate to the right CPU */ + current->cpus_allowed = 1UL << cpu; + 2449: b8 01 00 00 00 mov $0x1,%eax + 244e: 89 f9 mov %edi,%ecx + 2450: d3 e0 shl %cl,%eax + 2452: 89 43 38 mov %eax,0x38(%ebx) + while (smp_processor_id() != cpu) + 2455: bb 40 00 00 00 mov $0x40,%ebx + 245a: 8d 04 fd 00 00 00 00 lea 0x0(,%edi,8),%eax + 2461: 89 44 24 14 mov %eax,0x14(%esp,1) + 2465: 89 fe mov %edi,%esi + 2467: c1 e6 04 shl $0x4,%esi + 246a: 8d 14 bd 00 00 00 00 lea 0x0(,%edi,4),%edx + 2471: 89 54 24 18 mov %edx,0x18(%esp,1) + 2475: 8d 4c 24 2c lea 0x2c(%esp,1),%ecx + 2479: 89 4c 24 10 mov %ecx,0x10(%esp,1) + 247d: eb 06 jmp 2485 + 247f: 90 nop + schedule(); + 2480: e8 fc ff ff ff call 2481 + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; 
":"=r" (current) : "0" (~8191UL)); + 2485: bd 00 e0 ff ff mov $0xffffe000,%ebp + 248a: 21 e5 and %esp,%ebp + 248c: 39 7d 30 cmp %edi,0x30(%ebp) + 248f: 75 ef jne 2480 + + sprintf(current->comm, "pagebuf_io_CPU%d", bind_cpu); + 2491: 57 push %edi + 2492: 68 4e 00 00 00 push $0x4e + 2497: 8d 85 3e 03 00 00 lea 0x33e(%ebp),%eax + 249d: 50 push %eax + 249e: e8 fc ff ff ff call 249f + INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]); + 24a3: 83 c4 0c add $0xc,%esp + 24a6: 8b 4c 24 14 mov 0x14(%esp,1),%ecx + 24aa: 81 c1 40 00 00 00 add $0x40,%ecx + 24b0: 89 0c fb mov %ecx,(%ebx,%edi,8) + 24b3: 89 4c fb 04 mov %ecx,0x4(%ebx,%edi,8) +#define DECLARE_WAIT_QUEUE_HEAD(name) \ + wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name) + +static inline void init_waitqueue_head(wait_queue_head_t *q) +{ + 24b7: 8d 9e 40 01 00 00 lea 0x140(%esi),%ebx +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); +#endif + q->lock = WAITQUEUE_RW_LOCK_UNLOCKED; + 24bd: b8 01 00 00 00 mov $0x1,%eax + 24c2: ba ad 4e ad de mov $0xdead4ead,%edx + 24c7: 89 86 40 01 00 00 mov %eax,0x140(%esi) + 24cd: 89 96 44 01 00 00 mov %edx,0x144(%esi) + INIT_LIST_HEAD(&q->task_list); + 24d3: 8d 43 08 lea 0x8(%ebx),%eax + 24d6: 89 43 08 mov %eax,0x8(%ebx) + 24d9: 89 86 4c 01 00 00 mov %eax,0x14c(%esi) + init_waitqueue_head(&pagebuf_iodone_wait[cpu]); + __set_current_state(TASK_INTERRUPTIBLE); + 24df: c7 45 00 01 00 00 00 movl $0x1,0x0(%ebp) + mb(); + 24e6: f0 83 44 24 00 00 lock addl $0x0,0x0(%esp,1) + + pb_daemons[cpu] = 1; + 24ec: 8b 54 24 18 mov 0x18(%esp,1),%edx + 24f0: b8 00 06 00 00 mov $0x600,%eax + 24f5: c7 04 02 01 00 00 00 movl $0x1,(%edx,%eax,1) + + for (;;) { + 24fc: 89 de mov %ebx,%esi + 24fe: 89 cf mov %ecx,%edi + 2500: 89 eb mov %ebp,%ebx + add_wait_queue(&pagebuf_iodone_wait[cpu], + 2502: 8b 54 24 10 mov 0x10(%esp,1),%edx + 2506: 89 f0 mov %esi,%eax + 2508: e8 fc ff ff ff call 2509 + &wait); + + if (TQ_ACTIVE(pagebuf_iodone_tq[cpu])) + 250d: 39 3f cmp %edi,(%edi) + 250f: 74 06 je 2517 + 
__set_task_state(current, TASK_RUNNING); + 2511: c7 03 00 00 00 00 movl $0x0,(%ebx) + schedule(); + 2517: e8 fc ff ff ff call 2518 + remove_wait_queue(&pagebuf_iodone_wait[cpu], + 251c: 8b 54 24 10 mov 0x10(%esp,1),%edx + 2520: 89 f0 mov %esi,%eax + 2522: e8 fc ff ff ff call 2523 +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + 2527: 8b 4c 24 14 mov 0x14(%esp,1),%ecx + 252b: 8b 44 24 14 mov 0x14(%esp,1),%eax + 252f: 05 40 00 00 00 add $0x40,%eax + 2534: 39 81 40 00 00 00 cmp %eax,0x40(%ecx) + 253a: 74 09 je 2545 + __run_task_queue(list); + 253c: 50 push %eax + 253d: e8 fc ff ff ff call 253e + 2542: 83 c4 04 add $0x4,%esp + &wait); + run_task_queue(&pagebuf_iodone_tq[cpu]); + if (pb_daemons[cpu] == 0) + 2545: 8b 54 24 18 mov 0x18(%esp,1),%edx + 2549: b8 00 06 00 00 mov $0x600,%eax + 254e: 83 3c 02 00 cmpl $0x0,(%edx,%eax,1) + 2552: 74 0c je 2560 + break; + __set_current_state(TASK_INTERRUPTIBLE); + 2554: c7 03 01 00 00 00 movl $0x1,(%ebx) + 255a: eb a6 jmp 2502 + 255c: 8d 74 26 00 lea 0x0(%esi,1),%esi + } + + pb_daemons[cpu] = -1; + 2560: 8b 44 24 18 mov 0x18(%esp,1),%eax + 2564: b9 00 06 00 00 mov $0x600,%ecx + 2569: c7 04 08 ff ff ff ff movl $0xffffffff,(%eax,%ecx,1) + wake_up_interruptible(&pagebuf_iodone_wait[cpu]); + 2570: b9 01 00 00 00 mov $0x1,%ecx + 2575: ba 01 00 00 00 mov $0x1,%edx + 257a: 89 f0 mov %esi,%eax + 257c: e8 fc ff ff ff call 257d + return 0; + 2581: 31 c0 xor %eax,%eax + 2583: 5b pop %ebx + 2584: 5e pop %esi + 2585: 5f pop %edi + 2586: 5d pop %ebp + 2587: 83 c4 2c add $0x2c,%esp + 258a: c3 ret +} + 258b: 90 nop + +000000000000258c : + +/* Defines for pagebuf daemon */ +DECLARE_WAIT_QUEUE_HEAD(pbd_waitq); +STATIC int force_flush; + +STATIC void +pagebuf_daemon_wakeup( + int flag) +{ + 258c: 8b 44 24 04 mov 0x4(%esp,1),%eax + force_flush = flag; + 2590: a3 80 06 00 00 mov %eax,0x680 + if (waitqueue_active(&pbd_waitq)) { + 2595: 81 3d 08 00 00 00 08 cmpl $0x8,0x8 
+ 259c: 00 00 00 + 259f: 74 14 je 25b5 + wake_up_interruptible(&pbd_waitq); + 25a1: b9 01 00 00 00 mov $0x1,%ecx + 25a6: ba 01 00 00 00 mov $0x1,%edx + 25ab: b8 00 00 00 00 mov $0x0,%eax + 25b0: e8 fc ff ff ff call 25b1 + } + 25b5: c3 ret +} + 25b6: 89 f6 mov %esi,%esi + +00000000000025b8 : + +typedef void (*timeout_fn)(unsigned long); + +STATIC int +pagebuf_daemon( + void *data) +{ + int count; + page_buf_t *pb; + struct list_head *curr, *next, tmp; + struct timer_list pb_daemon_timer = + 25b8: 55 push %ebp + 25b9: 89 e5 mov %esp,%ebp + 25bb: 83 ec 20 sub $0x20,%esp + 25be: 57 push %edi + 25bf: 56 push %esi + 25c0: 53 push %ebx + 25c1: 8d 7d ec lea 0xffffffec(%ebp),%edi + 25c4: 31 c0 xor %eax,%eax + 25c6: fc cld + 25c7: ab stos %eax,%es:(%edi) + 25c8: ab stos %eax,%es:(%edi) + 25c9: ab stos %eax,%es:(%edi) + 25ca: ab stos %eax,%es:(%edi) + 25cb: ab stos %eax,%es:(%edi) + 25cc: c7 45 fc 8c 25 00 00 movl $0x258c,0xfffffffc(%ebp) + { {NULL, NULL}, 0, 0, (timeout_fn)pagebuf_daemon_wakeup }; + + /* Set up the thread */ + daemonize(); + 25d3: e8 fc ff ff ff call 25d4 + + /* Avoid signals */ + spin_lock_irq(¤t->sigmask_lock); + 25d8: fa cli + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 25d9: b8 00 e0 ff ff mov $0xffffe000,%eax + 25de: 21 e0 and %esp,%eax + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 25e0: 8d 98 5c 06 00 00 lea 0x65c(%eax),%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 25e6: 81 7b 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ebx) + 25ed: 74 1a je 2609 +printk("eip: %p\n", &&here); + 25ef: 68 e6 25 00 00 push $0x25e6 + 25f4: 68 2b 00 00 00 push $0x2b + 25f9: e8 fc ff ff ff call 25fa + BUG(); + 25fe: 0f 0b ud2a + 2600: 85 00 test %eax,(%eax) + 2602: 00 00 add %al,(%eax) + 2604: 00 00 add %al,(%eax) + } + 2606: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 2609: f0 fe 0b 
lock decb (%ebx) + 260c: 0f 88 8f 08 00 00 js 2ea1 + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 2612: b9 00 e0 ff ff mov $0xffffe000,%ecx + 2617: 21 e1 and %esp,%ecx + switch (_NSIG_WORDS) { + default: + memset(set, -1, sizeof(sigset_t)); + break; + case 2: set->sig[1] = -1; + 2619: c7 81 6c 06 00 00 ff movl $0xffffffff,0x66c(%ecx) + 2620: ff ff ff + case 1: set->sig[0] = -1; + 2623: c7 81 68 06 00 00 ff movl $0xffffffff,0x668(%ecx) + 262a: ff ff ff + All callers should have t->sigmask_lock. */ + +static inline void recalc_sigpending(struct task_struct *t) +{ + t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); + 262d: c7 41 08 00 00 00 00 movl $0x0,0x8(%ecx) + +static inline struct task_struct * get_current(void) +{ + struct task_struct *current; + __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); + 2634: 89 c8 mov %ecx,%eax + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 2636: b2 01 mov $0x1,%dl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 2638: 81 b8 60 06 00 00 ad cmpl $0xdead4ead,0x660(%eax) + 263f: 4e ad de + 2642: 74 0c je 2650 + BUG(); + 2644: 0f 0b ud2a + 2646: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + 264c: 8d 74 26 00 lea 0x0(%esi,1),%esi + if (!spin_is_locked(lock)) + 2650: 8a 80 5c 06 00 00 mov 0x65c(%eax),%al + 2656: 84 c0 test %al,%al + 2658: 7e 08 jle 2662 + BUG(); + 265a: 0f 0b ud2a + 265c: 6b 00 00 imul $0x0,(%eax),%eax + 265f: 00 00 add %al,(%eax) + 2661: 00 86 91 5c 06 00 add %al,0x65c91(%esi) +#endif + __asm__ __volatile__( + 2667: 00 fb add %bh,%bl + */ + +#define __HAVE_ARCH_STRCPY +static inline char * strcpy(char * dest,const char *src) +{ + 2669: 8d b9 3e 03 00 00 lea 0x33e(%ecx),%edi + 266f: be 5f 00 00 00 mov $0x5f,%esi +int d0, d1, d2; +__asm__ __volatile__( + 2674: ac lods %ds:(%esi),%al + 2675: aa stos %al,%es:(%edi) + 2676: 84 c0 
test %al,%al + 2678: 75 fa jne 2674 + sigfillset(¤t->blocked); + recalc_sigpending(current); + spin_unlock_irq(¤t->sigmask_lock); + + strcpy(current->comm, "pagebufd"); + current->flags |= PF_MEMALLOC; + 267a: 80 49 05 08 orb $0x8,0x5(%ecx) + + INIT_LIST_HEAD(&tmp); + 267e: 8d 45 e4 lea 0xffffffe4(%ebp),%eax + 2681: 89 45 e4 mov %eax,0xffffffe4(%ebp) + 2684: 89 40 04 mov %eax,0x4(%eax) + do { + if (pb_daemon->active == 1) { + 2687: 8b 15 28 00 00 00 mov 0x28,%edx + 268d: 83 3a 01 cmpl $0x1,(%edx) + 2690: 75 2d jne 26bf + del_timer(&pb_daemon_timer); + 2692: 8d 45 ec lea 0xffffffec(%ebp),%eax + 2695: 50 push %eax + 2696: e8 fc ff ff ff call 2697 + pb_daemon_timer.expires = jiffies + + 269b: a1 00 00 00 00 mov 0x0,%eax + 26a0: 03 05 00 00 00 00 add 0x0,%eax + 26a6: 89 45 f4 mov %eax,0xfffffff4(%ebp) + pb_params.p_un.flush_interval; + add_timer(&pb_daemon_timer); + 26a9: 8d 4d ec lea 0xffffffec(%ebp),%ecx + 26ac: 51 push %ecx + 26ad: e8 fc ff ff ff call 26ae + interruptible_sleep_on(&pbd_waitq); + 26b2: b8 00 00 00 00 mov $0x0,%eax + 26b7: e8 fc ff ff ff call 26b8 + } + 26bc: 83 c4 08 add $0x8,%esp + + if (pb_daemon->active == 0) { + 26bf: a1 28 00 00 00 mov 0x28,%eax + 26c4: 83 38 00 cmpl $0x0,(%eax) + 26c7: 75 0c jne 26d5 + del_timer(&pb_daemon_timer); + 26c9: 8d 45 ec lea 0xffffffec(%ebp),%eax + 26cc: 50 push %eax + 26cd: e8 fc ff ff ff call 26ce + } + 26d2: 83 c4 04 add $0x4,%esp + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 26d5: 8b 1d 28 00 00 00 mov 0x28,%ebx + 26db: 83 c3 08 add $0x8,%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 26de: 81 7b 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ebx) + 26e5: 74 1a je 2701 +printk("eip: %p\n", &&here); + 26e7: 68 de 26 00 00 push $0x26de + 26ec: 68 2b 00 00 00 push $0x2b + 26f1: e8 fc ff ff ff call 26f2 + BUG(); + 26f6: 0f 0b ud2a + 26f8: 85 00 test %eax,(%eax) + 26fa: 00 00 add %al,(%eax) + 26fc: 00 00 add %al,(%eax) + } + 26fe: 83 c4 08 add $0x8,%esp 
+#endif + __asm__ __volatile__( + 2701: f0 fe 0b lock decb (%ebx) + 2704: 0f 88 a3 07 00 00 js 2ead + + spin_lock(&pb_daemon->pb_delwrite_lock); + + count = 0; + 270a: c7 45 e0 00 00 00 00 movl $0x0,0xffffffe0(%ebp) + list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) { + 2711: a1 28 00 00 00 mov 0x28,%eax + 2716: 8b 70 10 mov 0x10(%eax),%esi + 2719: 83 c0 10 add $0x10,%eax + 271c: 8b 3e mov (%esi),%edi + 271e: 39 c6 cmp %eax,%esi + 2720: 0f 84 79 00 00 00 je 279f + pb = list_entry(curr, page_buf_t, pb_list); + 2726: 89 f3 mov %esi,%ebx + + PB_TRACE(pb, PB_TRACE_REC(walkq1), pagebuf_ispin(pb)); + + if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) && + 2728: f6 43 08 40 testb $0x40,0x8(%ebx) + 272c: 74 61 je 278f + 272e: 53 push %ebx + 272f: e8 fc ff ff ff call 2730 + 2734: 83 c4 04 add $0x4,%esp + 2737: 85 c0 test %eax,%eax + 2739: 75 54 jne 278f + 273b: f6 43 0a 08 testb $0x8,0xa(%ebx) + 273f: 74 0f je 2750 + 2741: 53 push %ebx + 2742: e8 fc ff ff ff call 2743 + 2747: 83 c4 04 add $0x4,%esp + 274a: 85 c0 test %eax,%eax + 274c: 75 41 jne 278f + 274e: 89 f6 mov %esi,%esi + (((pb->pb_flags & _PBF_LOCKABLE) == 0) || + !pagebuf_cond_lock(pb))) { + + if (!force_flush && time_before(jiffies, + 2750: 83 3d 80 06 00 00 00 cmpl $0x0,0x680 + 2757: 75 18 jne 2771 + 2759: a1 00 00 00 00 mov 0x0,%eax + 275e: 2b 83 b0 00 00 00 sub 0xb0(%ebx),%eax + 2764: 79 0b jns 2771 + PBP(pb)->pb_flushtime)) { + pagebuf_unlock(pb); + 2766: 53 push %ebx + 2767: e8 fc ff ff ff call 2768 + break; + 276c: 83 c4 04 add $0x4,%esp + 276f: eb 2e jmp 279f + * the prev/next entries already! 
+ */ +static __inline__ void __list_del(struct list_head * prev, + struct list_head * next) +{ + 2771: 8b 46 04 mov 0x4(%esi),%eax + 2774: 8b 16 mov (%esi),%edx + next->prev = prev; + 2776: 89 42 04 mov %eax,0x4(%edx) + prev->next = next; + 2779: 89 10 mov %edx,(%eax) + 277b: 8b 45 e4 mov 0xffffffe4(%ebp),%eax + 277e: 89 70 04 mov %esi,0x4(%eax) + 2781: 89 06 mov %eax,(%esi) + 2783: 8d 4d e4 lea 0xffffffe4(%ebp),%ecx + 2786: 89 4e 04 mov %ecx,0x4(%esi) + 2789: 89 75 e4 mov %esi,0xffffffe4(%ebp) + } + + list_del(&pb->pb_list); + list_add(&pb->pb_list, &tmp); + + count++; + 278c: ff 45 e0 incl 0xffffffe0(%ebp) + 278f: 89 fe mov %edi,%esi + 2791: 8b 3e mov (%esi),%edi + 2793: a1 28 00 00 00 mov 0x28,%eax + 2798: 83 c0 10 add $0x10,%eax + 279b: 39 c6 cmp %eax,%esi + 279d: 75 87 jne 2726 + :"=q" (oldval), "=m" (lock->lock) \ + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + 279f: 8b 15 28 00 00 00 mov 0x28,%edx + char oldval = 1; + 27a5: b1 01 mov $0x1,%cl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 27a7: 81 7a 0c ad 4e ad de cmpl $0xdead4ead,0xc(%edx) + 27ae: 74 08 je 27b8 + BUG(); + 27b0: 0f 0b ud2a + 27b2: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 27b8: 8a 42 08 mov 0x8(%edx),%al + 27bb: 84 c0 test %al,%al + 27bd: 7e 08 jle 27c7 + BUG(); + 27bf: 0f 0b ud2a + 27c1: 6b 00 00 imul $0x0,(%eax),%eax + 27c4: 00 00 add %al,(%eax) + 27c6: 00 86 4a 08 8d 45 add %al,0x458d084a(%esi) + } + } + + spin_unlock(&pb_daemon->pb_delwrite_lock); + while (!list_empty(&tmp)) { + 27cc: e4 39 in $0x39,%al + 27ce: 45 inc %ebp + 27cf: e4 74 in $0x74,%al + 27d1: 3f aas + pb = list_entry(tmp.next, + 27d2: 8b 5d e4 mov 0xffffffe4(%ebp),%ebx + * the prev/next entries already! 
+ */ +static __inline__ void __list_del(struct list_head * prev, + struct list_head * next) +{ + 27d5: 8b 43 04 mov 0x4(%ebx),%eax + 27d8: 8b 13 mov (%ebx),%edx + next->prev = prev; + 27da: 89 42 04 mov %eax,0x4(%edx) + prev->next = next; + 27dd: 89 10 mov %edx,(%eax) +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline__ void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static __inline__ void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); + 27df: 89 1b mov %ebx,(%ebx) + 27e1: 89 5b 04 mov %ebx,0x4(%ebx) + page_buf_t, pb_list); + list_del_init(&pb->pb_list); + pb->pb_flags &= ~PBF_DELWRI; + 27e4: 8b 43 08 mov 0x8(%ebx),%eax + 27e7: 24 bf and $0xbf,%al + pb->pb_flags |= PBF_WRITE; + 27e9: 0c 02 or $0x2,%al + 27eb: 89 43 08 mov %eax,0x8(%ebx) +extern void pagebuf_terminate(void); + +static __inline__ int __pagebuf_iorequest(page_buf_t *pb) +{ + if (pb->pb_strat) + 27ee: 83 7b 54 00 cmpl $0x0,0x54(%ebx) + 27f2: 74 0c je 2800 + return pb->pb_strat(pb); + 27f4: 53 push %ebx + 27f5: 8b 43 54 mov 0x54(%ebx),%eax + 27f8: ff d0 call *%eax + 27fa: eb 0a jmp 2806 + 27fc: 8d 74 26 00 lea 0x0(%esi,1),%esi + return pagebuf_iorequest(pb); + 2800: 53 push %ebx + 2801: e8 fc ff ff ff call 2802 + 2806: 83 c4 04 add $0x4,%esp + 2809: 8d 4d e4 lea 0xffffffe4(%ebp),%ecx + 280c: 39 4d e4 cmp %ecx,0xffffffe4(%ebp) + 280f: 75 c1 jne 27d2 + + __pagebuf_iorequest(pb); + } + + if (count) + 2811: 83 7d e0 00 cmpl $0x0,0xffffffe0(%ebp) + 2815: 74 19 je 2830 +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + 2817: 81 3d 00 00 00 00 
00 cmpl $0x0,0x0 + 281e: 00 00 00 + 2821: 74 0d je 2830 + __run_task_queue(list); + 2823: 68 00 00 00 00 push $0x0 + 2828: e8 fc ff ff ff call 2829 + 282d: 83 c4 04 add $0x4,%esp + run_task_queue(&tq_disk); + if (as_list_len > 0) + 2830: 83 3d e4 05 00 00 00 cmpl $0x0,0x5e4 + 2837: 7e 05 jle 283e + purge_addresses(); + 2839: e8 96 d8 ff ff call d4 + + force_flush = 0; + } while (pb_daemon->active == 1); + 283e: a1 28 00 00 00 mov 0x28,%eax + 2843: c7 05 80 06 00 00 00 movl $0x0,0x680 + 284a: 00 00 00 + 284d: 83 38 01 cmpl $0x1,(%eax) + 2850: 0f 84 3c fe ff ff je 2692 + + pb_daemon->active = -1; + 2856: c7 00 ff ff ff ff movl $0xffffffff,(%eax) + wake_up_interruptible(&pbd_waitq); + 285c: b9 01 00 00 00 mov $0x1,%ecx + 2861: ba 01 00 00 00 mov $0x1,%edx + 2866: b8 00 00 00 00 mov $0x0,%eax + 286b: e8 fc ff ff ff call 286c + + return 0; + 2870: 31 c0 xor %eax,%eax + 2872: 8d 65 d4 lea 0xffffffd4(%ebp),%esp + 2875: 5b pop %ebx + 2876: 5e pop %esi + 2877: 5f pop %edi + 2878: 89 ec mov %ebp,%esp + 287a: 5d pop %ebp + 287b: c3 ret + +000000000000287c : +} + +void +pagebuf_delwri_flush( + pb_target_t *target, + u_long flags, + int *pinptr) +{ + page_buf_t *pb; + struct list_head *curr, *next, tmp; + int pincount = 0; + 287c: 83 ec 10 sub $0x10,%esp + 287f: 55 push %ebp + 2880: 57 push %edi + 2881: 56 push %esi + 2882: 53 push %ebx + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 2883: 8b 1d 28 00 00 00 mov 0x28,%ebx + 2889: 31 ed xor %ebp,%ebp + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 288b: 83 c3 08 add $0x8,%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 288e: 81 7b 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ebx) + 2895: 74 1a je 28b1 +printk("eip: %p\n", &&here); + 2897: 68 8e 28 00 00 push $0x288e + 289c: 68 2b 00 00 00 push $0x2b + 28a1: e8 fc ff ff ff call 28a2 + BUG(); + 28a6: 0f 0b ud2a + 28a8: 85 00 test %eax,(%eax) + 28aa: 00 00 add %al,(%eax) + 28ac: 00 00 add 
%al,(%eax) + } + 28ae: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 28b1: f0 fe 0b lock decb (%ebx) + 28b4: 0f 88 ff 05 00 00 js 2eb9 + + spin_lock(&pb_daemon->pb_delwrite_lock); + INIT_LIST_HEAD(&tmp); + 28ba: 8d 44 24 18 lea 0x18(%esp,1),%eax + 28be: 89 44 24 18 mov %eax,0x18(%esp,1) + 28c2: 89 40 04 mov %eax,0x4(%eax) + + list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) { + 28c5: 8b 15 28 00 00 00 mov 0x28,%edx + 28cb: 8b 72 10 mov 0x10(%edx),%esi + 28ce: 8b 44 24 28 mov 0x28(%esp,1),%eax + 28d2: 83 c2 10 add $0x10,%edx + 28d5: 8b 0e mov (%esi),%ecx + 28d7: 89 4c 24 14 mov %ecx,0x14(%esp,1) + 28db: 89 44 24 10 mov %eax,0x10(%esp,1) + 28df: 83 64 24 10 01 andl $0x1,0x10(%esp,1) + 28e4: 39 d6 cmp %edx,%esi + 28e6: 0f 84 2c 01 00 00 je 2a18 + 28ec: 8d 74 26 00 lea 0x0(%esi,1),%esi + pb = list_entry(curr, page_buf_t, pb_list); + 28f0: 89 f3 mov %esi,%ebx + + /* + * Skip other targets, markers and in progress buffers + */ + + if ((pb->pb_flags == 0) || (pb->pb_target != target) || + 28f2: 8b 43 08 mov 0x8(%ebx),%eax + 28f5: 85 c0 test %eax,%eax + 28f7: 0f 84 01 01 00 00 je 29fe + 28fd: 8b 54 24 24 mov 0x24(%esp,1),%edx + 2901: 39 53 14 cmp %edx,0x14(%ebx) + 2904: 0f 85 f4 00 00 00 jne 29fe + 290a: a8 40 test $0x40,%al + 290c: 0f 84 ec 00 00 00 je 29fe + !(pb->pb_flags & PBF_DELWRI)) { + continue; + } + + PB_TRACE(pb, PB_TRACE_REC(walkq2), pagebuf_ispin(pb)); + if (pagebuf_ispin(pb)) { + 2912: 53 push %ebx + 2913: e8 fc ff ff ff call 2914 + 2918: 83 c4 04 add $0x4,%esp + 291b: 85 c0 test %eax,%eax + 291d: 75 16 jne 2935 + pincount++; + continue; + } + + if (flags & PBDF_TRYLOCK) { + 291f: 8b 7c 24 28 mov 0x28(%esp,1),%edi + 2923: 83 e7 02 and $0x2,%edi + 2926: 74 18 je 2940 + if (!pagebuf_cond_lock(pb)) { + 2928: 53 push %ebx + 2929: e8 fc ff ff ff call 292a + 292e: 83 c4 04 add $0x4,%esp + 2931: 85 c0 test %eax,%eax + 2933: 75 0b jne 2940 + pincount++; + 2935: 45 inc %ebp + continue; + 2936: e9 c3 00 00 00 jmp 29fe + 293b: 90 nop + 293c: 8d 74 26 
00 lea 0x0(%esi,1),%esi + * the prev/next entries already! + */ +static __inline__ void __list_del(struct list_head * prev, + struct list_head * next) +{ + 2940: 8b 56 04 mov 0x4(%esi),%edx + 2943: 8b 06 mov (%esi),%eax + next->prev = prev; + 2945: 89 50 04 mov %edx,0x4(%eax) + prev->next = next; + 2948: 89 02 mov %eax,(%edx) +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline__ void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static __inline__ void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); + 294a: 89 36 mov %esi,(%esi) + 294c: 89 76 04 mov %esi,0x4(%esi) + } + } + + list_del_init(&pb->pb_list); + if (flags & PBDF_WAIT) { + 294f: 83 7c 24 10 00 cmpl $0x0,0x10(%esp,1) + 2954: 74 1a je 2970 + */ +static __inline__ void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + 2956: 8b 44 24 18 mov 0x18(%esp,1),%eax + next->prev = new; + 295a: 89 70 04 mov %esi,0x4(%eax) + new->next = next; + 295d: 89 06 mov %eax,(%esi) + new->prev = prev; + 295f: 8d 4c 24 18 lea 0x18(%esp,1),%ecx + 2963: 89 4e 04 mov %ecx,0x4(%esi) + prev->next = new; + 2966: 89 74 24 18 mov %esi,0x18(%esp,1) + list_add(&pb->pb_list, &tmp); + pb->pb_flags &= ~PBF_ASYNC; + 296a: 80 66 08 ef andb $0xef,0x8(%esi) + 296e: 89 f6 mov %esi,%esi + :"=q" (oldval), "=m" (lock->lock) \ + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + 2970: 8b 15 28 00 00 00 mov 0x28,%edx + char oldval = 1; + 2976: b1 01 mov $0x1,%cl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 2978: 81 7a 0c ad 4e ad de cmpl $0xdead4ead,0xc(%edx) + 297f: 74 08 je 2989 + 
BUG(); + 2981: 0f 0b ud2a + 2983: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 2989: 8a 42 08 mov 0x8(%edx),%al + 298c: 84 c0 test %al,%al + 298e: 7e 08 jle 2998 + BUG(); + 2990: 0f 0b ud2a + 2992: 6b 00 00 imul $0x0,(%eax),%eax + 2995: 00 00 add %al,(%eax) + 2997: 00 86 4a 08 85 ff add %al,0xff85084a(%esi) + } + + spin_unlock(&pb_daemon->pb_delwrite_lock); + + if ((flags & PBDF_TRYLOCK) == 0) { + 299d: 75 09 jne 29a8 + pagebuf_lock(pb); + 299f: 56 push %esi + 29a0: e8 fc ff ff ff call 29a1 + } + 29a5: 83 c4 04 add $0x4,%esp + + pb->pb_flags &= ~PBF_DELWRI; + 29a8: 8b 46 08 mov 0x8(%esi),%eax + 29ab: 24 bf and $0xbf,%al + pb->pb_flags |= PBF_WRITE; + 29ad: 0c 02 or $0x2,%al + 29af: 89 46 08 mov %eax,0x8(%esi) +extern void pagebuf_terminate(void); + +static __inline__ int __pagebuf_iorequest(page_buf_t *pb) +{ + if (pb->pb_strat) + 29b2: 83 7e 54 00 cmpl $0x0,0x54(%esi) + 29b6: 74 08 je 29c0 + return pb->pb_strat(pb); + 29b8: 56 push %esi + 29b9: 8b 46 54 mov 0x54(%esi),%eax + 29bc: ff d0 call *%eax + 29be: eb 06 jmp 29c6 + return pagebuf_iorequest(pb); + 29c0: 56 push %esi + 29c1: e8 fc ff ff ff call 29c2 + 29c6: 83 c4 04 add $0x4,%esp + return oldval > 0; +} + +static inline void spin_lock(spinlock_t *lock) +{ + 29c9: 8b 1d 28 00 00 00 mov 0x28,%ebx + 29cf: 83 c3 08 add $0x8,%ebx +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 29d2: 81 7b 04 ad 4e ad de cmpl $0xdead4ead,0x4(%ebx) + 29d9: 74 1a je 29f5 +printk("eip: %p\n", &&here); + 29db: 68 d2 29 00 00 push $0x29d2 + 29e0: 68 2b 00 00 00 push $0x2b + 29e5: e8 fc ff ff ff call 29e6 + BUG(); + 29ea: 0f 0b ud2a + 29ec: 85 00 test %eax,(%eax) + 29ee: 00 00 add %al,(%eax) + 29f0: 00 00 add %al,(%eax) + } + 29f2: 83 c4 08 add $0x8,%esp +#endif + __asm__ __volatile__( + 29f5: f0 fe 0b lock decb (%ebx) + 29f8: 0f 88 c7 04 00 00 js 2ec5 + 29fe: 8b 74 24 14 mov 0x14(%esp,1),%esi + 2a02: 8b 06 mov (%esi),%eax + 2a04: 89 44 24 14 mov %eax,0x14(%esp,1) + 2a08: a1 
28 00 00 00 mov 0x28,%eax + 2a0d: 83 c0 10 add $0x10,%eax + 2a10: 39 c6 cmp %eax,%esi + 2a12: 0f 85 d8 fe ff ff jne 28f0 + :"=q" (oldval), "=m" (lock->lock) \ + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + 2a18: 8b 15 28 00 00 00 mov 0x28,%edx + char oldval = 1; + 2a1e: b1 01 mov $0x1,%cl +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 2a20: 81 7a 0c ad 4e ad de cmpl $0xdead4ead,0xc(%edx) + 2a27: 74 08 je 2a31 + BUG(); + 2a29: 0f 0b ud2a + 2a2b: 69 00 00 00 00 00 imul $0x0,(%eax),%eax + if (!spin_is_locked(lock)) + 2a31: 8a 42 08 mov 0x8(%edx),%al + 2a34: 84 c0 test %al,%al + 2a36: 7e 08 jle 2a40 + BUG(); + 2a38: 0f 0b ud2a + 2a3a: 6b 00 00 imul $0x0,(%eax),%eax + 2a3d: 00 00 add %al,(%eax) + 2a3f: 00 86 4a 08 81 3d add %al,0x3d81084a(%esi) + ... +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + 2a4d: 74 0d je 2a5c + __run_task_queue(list); + 2a4f: 68 00 00 00 00 push $0x0 + 2a54: e8 fc ff ff ff call 2a55 + 2a59: 83 c4 04 add $0x4,%esp + + __pagebuf_iorequest(pb); + + spin_lock(&pb_daemon->pb_delwrite_lock); + } + + spin_unlock(&pb_daemon->pb_delwrite_lock); + + run_task_queue(&tq_disk); + + if (pinptr) + 2a5c: 83 7c 24 2c 00 cmpl $0x0,0x2c(%esp,1) + 2a61: 74 06 je 2a69 + *pinptr = pincount; + 2a63: 8b 54 24 2c mov 0x2c(%esp,1),%edx + 2a67: 89 2a mov %ebp,(%edx) + + if ((flags & PBDF_WAIT) == 0) + 2a69: 83 7c 24 10 00 cmpl $0x0,0x10(%esp,1) + 2a6e: 74 53 je 2ac3 + return; + + while (!list_empty(&tmp)) { + 2a70: 8d 4c 24 18 lea 0x18(%esp,1),%ecx + 2a74: 39 4c 24 18 cmp %ecx,0x18(%esp,1) + 2a78: 74 49 je 2ac3 + 2a7a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + pb = list_entry(tmp.next, page_buf_t, pb_list); + 2a80: 8b 5c 24 18 mov 0x18(%esp,1),%ebx + * the prev/next entries already! 
+ */ +static __inline__ void __list_del(struct list_head * prev, + struct list_head * next) +{ + 2a84: 8b 53 04 mov 0x4(%ebx),%edx + 2a87: 8b 03 mov (%ebx),%eax + next->prev = prev; + 2a89: 89 50 04 mov %edx,0x4(%eax) + prev->next = next; + 2a8c: 89 02 mov %eax,(%edx) +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline__ void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static __inline__ void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); + 2a8e: 89 1b mov %ebx,(%ebx) + 2a90: 89 5b 04 mov %ebx,0x4(%ebx) + + list_del_init(&pb->pb_list); + pagebuf_iowait(pb); + 2a93: 53 push %ebx + 2a94: e8 fc ff ff ff call 2a95 + if (!pb->pb_relse) + 2a99: 83 c4 04 add $0x4,%esp + 2a9c: 83 7b 50 00 cmpl $0x0,0x50(%ebx) + 2aa0: 75 0e jne 2ab0 + pagebuf_unlock(pb); + 2aa2: 53 push %ebx + 2aa3: e8 fc ff ff ff call 2aa4 + 2aa8: 83 c4 04 add $0x4,%esp + 2aab: 90 nop + 2aac: 8d 74 26 00 lea 0x0(%esi,1),%esi + pagebuf_rele(pb); + 2ab0: 53 push %ebx + 2ab1: e8 fc ff ff ff call 2ab2 + } + 2ab6: 83 c4 04 add $0x4,%esp + 2ab9: 8d 44 24 18 lea 0x18(%esp,1),%eax + 2abd: 39 44 24 18 cmp %eax,0x18(%esp,1) + 2ac1: 75 bd jne 2a80 +} + 2ac3: 5b pop %ebx + 2ac4: 5e pop %esi + 2ac5: 5f pop %edi + 2ac6: 5d pop %ebp + 2ac7: 83 c4 10 add $0x10,%esp + 2aca: c3 ret + 2acb: 90 nop + +0000000000002acc : + +STATIC int +pagebuf_daemon_start(void) +{ + if (!pb_daemon) { + 2acc: 55 push %ebp + 2acd: 57 push %edi + 2ace: 56 push %esi + 2acf: 53 push %ebx + 2ad0: 83 3d 28 00 00 00 00 cmpl $0x0,0x28 + 2ad7: 0f 85 dd 00 00 00 jne 2bba + int cpu; + + pb_daemon = (pagebuf_daemon_t *) + 2add: 68 f0 01 00 00 push $0x1f0 + 2ae2: 6a 1c push 
$0x1c + 2ae4: e8 fc ff ff ff call 2ae5 + 2ae9: 89 c1 mov %eax,%ecx + 2aeb: 89 0d 28 00 00 00 mov %ecx,0x28 + kmalloc(sizeof(pagebuf_daemon_t), GFP_KERNEL); + if (!pb_daemon) { + 2af1: 83 c4 08 add $0x8,%esp + 2af4: 85 c9 test %ecx,%ecx + 2af6: 75 0a jne 2b02 + return -1; /* error */ + 2af8: b8 ff ff ff ff mov $0xffffffff,%eax + 2afd: e9 ba 00 00 00 jmp 2bbc + } + + pb_daemon->active = 1; + 2b02: c7 01 01 00 00 00 movl $0x1,(%ecx) + pb_daemon->io_active = 1; + 2b08: c7 41 04 01 00 00 00 movl $0x1,0x4(%ecx) + pb_daemon->pb_delwri_cnt = 0; + 2b0f: c7 41 18 00 00 00 00 movl $0x0,0x18(%ecx) + pb_daemon->pb_delwrite_lock = SPIN_LOCK_UNLOCKED; + 2b16: b8 01 00 00 00 mov $0x1,%eax + 2b1b: ba ad 4e ad de mov $0xdead4ead,%edx + 2b20: 89 41 08 mov %eax,0x8(%ecx) + 2b23: 89 51 0c mov %edx,0xc(%ecx) + + INIT_LIST_HEAD(&pb_daemon->pb_delwrite_l); + 2b26: 8d 41 10 lea 0x10(%ecx),%eax + 2b29: 89 41 10 mov %eax,0x10(%ecx) + 2b2c: 89 41 14 mov %eax,0x14(%ecx) + + kernel_thread(pagebuf_daemon, (void *)pb_daemon, + 2b2f: 68 00 07 00 00 push $0x700 + 2b34: 51 push %ecx + 2b35: 68 b8 25 00 00 push $0x25b8 + 2b3a: e8 fc ff ff ff call 2b3b + CLONE_FS|CLONE_FILES|CLONE_VM); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + 2b3f: 31 db xor %ebx,%ebx + 2b41: 83 c4 0c add $0xc,%esp + 2b44: 3b 1d 00 00 00 00 cmp 0x0,%ebx + 2b4a: 7d 6e jge 2bba + 2b4c: 8d 74 26 00 lea 0x0(%esi,1),%esi + if (kernel_thread(pagebuf_iodone_daemon, + 2b50: 68 00 07 00 00 push $0x700 + 2b55: 53 push %ebx + 2b56: 68 68 23 00 00 push $0x2368 + 2b5b: e8 fc ff ff ff call 2b5c + 2b60: 83 c4 0c add $0xc,%esp + 2b63: 85 c0 test %eax,%eax + 2b65: 7d 19 jge 2b80 + (void *)(long) cpu, + CLONE_FS|CLONE_FILES|CLONE_VM) < 0) { + printk("pagebuf_daemon_start failed\n"); + 2b67: 68 68 00 00 00 push $0x68 + 2b6c: e8 fc ff ff ff call 2b6d + } else { + 2b71: 83 c4 04 add $0x4,%esp + 2b74: 8d 7b 01 lea 0x1(%ebx),%edi + 2b77: eb 37 jmp 2bb0 + 2b79: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi + while (!pb_daemons[cpu_logical_map(cpu)]) { + 
2b80: 8d 04 9d 00 00 00 00 lea 0x0(,%ebx,4),%eax + 2b87: 89 c6 mov %eax,%esi + 2b89: bd 00 06 00 00 mov $0x600,%ebp + 2b8e: 8d 7b 01 lea 0x1(%ebx),%edi + 2b91: 83 be 00 06 00 00 00 cmpl $0x0,0x600(%esi) + 2b98: 75 16 jne 2bb0 + 2b9a: bb 00 e0 ff ff mov $0xffffe000,%ebx + 2b9f: 21 e3 and %esp,%ebx + current->policy |= SCHED_YIELD; + 2ba1: 80 4b 28 10 orb $0x10,0x28(%ebx) + schedule(); + 2ba5: e8 fc ff ff ff call 2ba6 + 2baa: 83 3c 2e 00 cmpl $0x0,(%esi,%ebp,1) + 2bae: 74 f1 je 2ba1 + 2bb0: 89 fb mov %edi,%ebx + 2bb2: 3b 1d 00 00 00 00 cmp 0x0,%ebx + 2bb8: 7c 96 jl 2b50 + } + } + } + } + return 0; + 2bba: 31 c0 xor %eax,%eax + 2bbc: 5b pop %ebx + 2bbd: 5e pop %esi + 2bbe: 5f pop %edi + 2bbf: 5d pop %ebp + 2bc0: c3 ret +} + 2bc1: 8d 76 00 lea 0x0(%esi),%esi + +0000000000002bc4 : + +/* + * pagebuf_daemon_stop + * + * Note: do not mark as __exit, it is called from pagebuf_terminate. + */ +STATIC void +pagebuf_daemon_stop(void) +{ + if (pb_daemon) { + 2bc4: a1 28 00 00 00 mov 0x28,%eax + 2bc9: 55 push %ebp + 2bca: 57 push %edi + 2bcb: 56 push %esi + 2bcc: 53 push %ebx + 2bcd: 85 c0 test %eax,%eax + 2bcf: 0f 84 ac 00 00 00 je 2c81 + int cpu; + + pb_daemon->active = 0; + 2bd5: c7 00 00 00 00 00 movl $0x0,(%eax) + pb_daemon->io_active = 0; + 2bdb: c7 40 04 00 00 00 00 movl $0x0,0x4(%eax) + + wake_up_interruptible(&pbd_waitq); + 2be2: b9 01 00 00 00 mov $0x1,%ecx + 2be7: ba 01 00 00 00 mov $0x1,%edx + 2bec: b8 00 00 00 00 mov $0x0,%eax + 2bf1: e8 fc ff ff ff call 2bf2 + while (pb_daemon->active == 0) { + 2bf6: eb 0a jmp 2c02 + interruptible_sleep_on(&pbd_waitq); + 2bf8: b8 00 00 00 00 mov $0x0,%eax + 2bfd: e8 fc ff ff ff call 2bfe + } + 2c02: a1 28 00 00 00 mov 0x28,%eax + 2c07: 83 38 00 cmpl $0x0,(%eax) + 2c0a: 74 ec je 2bf8 + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + 2c0c: 31 ff xor %edi,%edi + 2c0e: 3b 3d 00 00 00 00 cmp 0x0,%edi + 2c14: 7d 53 jge 2c69 + 2c16: bd 00 06 00 00 mov $0x600,%ebp + 2c1b: 90 nop + 2c1c: 8d 74 26 00 lea 0x0(%esi,1),%esi + 
pb_daemons[cpu_logical_map(cpu)] = 0; + 2c20: 8d 34 bd 00 00 00 00 lea 0x0(,%edi,4),%esi + 2c27: c7 04 2e 00 00 00 00 movl $0x0,(%esi,%ebp,1) + wake_up(&pagebuf_iodone_wait[cpu_logical_map(cpu)]); + 2c2e: 89 fb mov %edi,%ebx + 2c30: c1 e3 04 shl $0x4,%ebx + 2c33: 8d 83 40 01 00 00 lea 0x140(%ebx),%eax + 2c39: b9 01 00 00 00 mov $0x1,%ecx + 2c3e: ba 03 00 00 00 mov $0x3,%edx + 2c43: e8 fc ff ff ff call 2c44 + while (pb_daemons[cpu_logical_map(cpu)] != -1) { + 2c48: 47 inc %edi + 2c49: 83 3c 2e ff cmpl $0xffffffff,(%esi,%ebp,1) + 2c4d: 74 12 je 2c61 + 2c4f: 90 nop + interruptible_sleep_on( + &pagebuf_iodone_wait[cpu_logical_map(cpu)]); + 2c50: 8d 83 40 01 00 00 lea 0x140(%ebx),%eax + 2c56: e8 fc ff ff ff call 2c57 + 2c5b: 83 3c 2e ff cmpl $0xffffffff,(%esi,%ebp,1) + 2c5f: 75 ef jne 2c50 + 2c61: 3b 3d 00 00 00 00 cmp 0x0,%edi + 2c67: 7c b7 jl 2c20 + } + } + + kfree(pb_daemon); + 2c69: a1 28 00 00 00 mov 0x28,%eax + 2c6e: 50 push %eax + 2c6f: e8 fc ff ff ff call 2c70 + pb_daemon = NULL; + 2c74: c7 05 28 00 00 00 00 movl $0x0,0x28 + 2c7b: 00 00 00 + } + 2c7e: 83 c4 04 add $0x4,%esp + 2c81: 5b pop %ebx + 2c82: 5e pop %esi + 2c83: 5f pop %edi + 2c84: 5d pop %ebp + 2c85: c3 ret +} + 2c86: 89 f6 mov %esi,%esi + +0000000000002c88 : + + +/* + * Pagebuf sysctl interface + */ + +STATIC int +pb_stats_clear_handler( + ctl_table *ctl, + int write, + struct file *filp, + void *buffer, + size_t *lenp) +{ + 2c88: 57 push %edi + 2c89: 56 push %esi + 2c8a: 53 push %ebx + 2c8b: 8b 5c 24 10 mov 0x10(%esp,1),%ebx + 2c8f: 8b 74 24 14 mov 0x14(%esp,1),%esi + 2c93: 8b 4c 24 18 mov 0x18(%esp,1),%ecx + 2c97: 8b 54 24 1c mov 0x1c(%esp,1),%edx + 2c9b: 8b 44 24 20 mov 0x20(%esp,1),%eax + int ret; + int *valp = ctl->data; + 2c9f: 8b 7b 08 mov 0x8(%ebx),%edi + + ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp); + 2ca2: 50 push %eax + 2ca3: 52 push %edx + 2ca4: 51 push %ecx + 2ca5: 56 push %esi + 2ca6: 53 push %ebx + 2ca7: e8 fc ff ff ff call 2ca8 + 2cac: 89 c3 mov %eax,%ebx + + if (!ret 
&& write && *valp) { + 2cae: 83 c4 14 add $0x14,%esp + 2cb1: 85 db test %ebx,%ebx + 2cb3: 75 2e jne 2ce3 + 2cb5: 85 f6 test %esi,%esi + 2cb7: 74 2a je 2ce3 + 2cb9: 83 3f 00 cmpl $0x0,(%edi) + 2cbc: 74 25 je 2ce3 + printk("XFS Clearing pbstats\n"); + 2cbe: 68 85 00 00 00 push $0x85 + 2cc3: e8 fc ff ff ff call 2cc4 + memset(&pbstats, 0, sizeof(pbstats)); + 2cc8: 83 c4 04 add $0x4,%esp + * This looks horribly ugly, but the compiler can optimize it totally, + * as we by now know that both pattern and count is constant.. + */ +static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count) +{ + 2ccb: bf 00 00 00 00 mov $0x0,%edi + switch (count) { + case 0: + return s; + case 1: + *(unsigned char *)s = pattern; + return s; + case 2: + *(unsigned short *)s = pattern; + return s; + case 3: + *(unsigned short *)s = pattern; + *(2+(unsigned char *)s) = pattern; + return s; + case 4: + *(unsigned long *)s = pattern; + return s; + } +#define COMMON(x) \ +__asm__ __volatile__( \ + "rep ; stosl" \ + x \ + : "=&c" (d0), "=&D" (d1) \ + : "a" (pattern),"0" (count/4),"1" ((long) s) \ + : "memory") +{ + int d0, d1; + switch (count % 4) { + case 0: COMMON(""); return s; + 2cd0: b9 09 00 00 00 mov $0x9,%ecx + 2cd5: 89 d8 mov %ebx,%eax + 2cd7: f3 ab repz stos %eax,%es:(%edi) + pb_params.p_un.stats_clear = 0; + 2cd9: c7 05 0c 00 00 00 00 movl $0x0,0xc + 2ce0: 00 00 00 + } + + return ret; + 2ce3: 89 d8 mov %ebx,%eax + 2ce5: 5b pop %ebx + 2ce6: 5e pop %esi + 2ce7: 5f pop %edi + 2ce8: c3 ret +} + 2ce9: 8d 76 00 lea 0x0(%esi),%esi + +0000000000002cec : + +STATIC struct ctl_table_header *pagebuf_table_header; + +STATIC ctl_table pagebuf_table[] = { + {PB_FLUSH_INT, "flush_int", &pb_params.data[0], + sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax, + &sysctl_intvec, NULL, &pagebuf_min[0], &pagebuf_max[0]}, + + {PB_FLUSH_AGE, "flush_age", &pb_params.data[1], + sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax, + &sysctl_intvec, NULL, 
&pagebuf_min[1], &pagebuf_max[1]}, + + {PB_STATS_CLEAR, "stats_clear", &pb_params.data[3], + sizeof(ulong), 0644, NULL, &pb_stats_clear_handler, + &sysctl_intvec, NULL, &pagebuf_min[3], &pagebuf_max[3]}, + +#ifdef PAGEBUF_TRACE + {PB_DEBUG, "debug", &pb_params.data[4], + sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax, + &sysctl_intvec, NULL, &pagebuf_min[4], &pagebuf_max[4]}, +#endif + {0} +}; + +STATIC ctl_table pagebuf_dir_table[] = { + {VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table}, + {0} +}; + +STATIC ctl_table pagebuf_root_table[] = { + {CTL_VM, "vm", NULL, 0, 0555, pagebuf_dir_table}, + {0} +}; + +#ifdef CONFIG_PROC_FS +STATIC int +pagebuf_readstats( + char *buffer, + char **start, + off_t offset, + int count, + int *eof, + void *data) +{ + 2cec: 55 push %ebp + 2ced: 57 push %edi + 2cee: 56 push %esi + 2cef: 53 push %ebx + 2cf0: 8b 7c 24 14 mov 0x14(%esp,1),%edi + 2cf4: 8b 6c 24 1c mov 0x1c(%esp,1),%ebp + int i, len; + + len = 0; + len += sprintf(buffer + len, "pagebuf"); + 2cf8: 68 bb 00 00 00 push $0xbb + 2cfd: 57 push %edi + 2cfe: e8 fc ff ff ff call 2cff + 2d03: 89 c3 mov %eax,%ebx + for (i = 0; i < sizeof(pbstats) / sizeof(u_int32_t); i++) { + 2d05: 31 f6 xor %esi,%esi + 2d07: 83 c4 08 add $0x8,%esp + 2d0a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + len += sprintf(buffer + len, " %u", + 2d10: 8b 04 b5 00 00 00 00 mov 0x0(,%esi,4),%eax + 2d17: 50 push %eax + 2d18: 68 c6 00 00 00 push $0xc6 + 2d1d: 8d 04 3b lea (%ebx,%edi,1),%eax + 2d20: 50 push %eax + 2d21: e8 fc ff ff ff call 2d22 + 2d26: 01 c3 add %eax,%ebx + 2d28: 83 c4 0c add $0xc,%esp + 2d2b: 46 inc %esi + 2d2c: 83 fe 08 cmp $0x8,%esi + 2d2f: 76 df jbe 2d10 + *(((u_int32_t*)&pbstats) + i)); + } + buffer[len++] = '\n'; + 2d31: c6 04 3b 0a movb $0xa,(%ebx,%edi,1) + 2d35: 43 inc %ebx + + if (offset >= len) { + 2d36: 39 dd cmp %ebx,%ebp + 2d38: 7c 16 jl 2d50 + *start = buffer; + 2d3a: 8b 44 24 18 mov 0x18(%esp,1),%eax + 2d3e: 89 38 mov %edi,(%eax) + *eof = 1; + 2d40: 8b 44 24 24 mov 
0x24(%esp,1),%eax + 2d44: c7 00 01 00 00 00 movl $0x1,(%eax) + return 0; + 2d4a: 31 c0 xor %eax,%eax + 2d4c: eb 26 jmp 2d74 + 2d4e: 89 f6 mov %esi,%esi + } + *start = buffer + offset; + 2d50: 8b 44 24 18 mov 0x18(%esp,1),%eax + 2d54: 01 ef add %ebp,%edi + 2d56: 89 38 mov %edi,(%eax) + if ((len -= offset) > count) + 2d58: 29 eb sub %ebp,%ebx + 2d5a: 3b 5c 24 20 cmp 0x20(%esp,1),%ebx + 2d5e: 7f 10 jg 2d70 + return count; + *eof = 1; + 2d60: 8b 44 24 24 mov 0x24(%esp,1),%eax + 2d64: c7 00 01 00 00 00 movl $0x1,(%eax) + + return len; + 2d6a: 89 d8 mov %ebx,%eax + 2d6c: eb 06 jmp 2d74 + 2d6e: 89 f6 mov %esi,%esi + 2d70: 8b 44 24 20 mov 0x20(%esp,1),%eax + 2d74: 5b pop %ebx + 2d75: 5e pop %esi + 2d76: 5f pop %edi + 2d77: 5d pop %ebp + 2d78: c3 ret +} + 2d79: 8d 76 00 lea 0x0(%esi),%esi + +0000000000002d7c : +#endif /* CONFIG_PROC_FS */ + +STATIC void +pagebuf_shaker(void) +{ + pagebuf_daemon_wakeup(1); + 2d7c: 6a 01 push $0x1 + 2d7e: e8 09 f8 ff ff call 258c +} + 2d83: 83 c4 04 add $0x4,%esp + 2d86: c3 ret + 2d87: 90 nop + +0000000000002d88 : + + +/* + * Initialization and Termination + */ + +int __init +pagebuf_init(void) +{ + int i; + + pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1); + +#ifdef CONFIG_PROC_FS + if (proc_mkdir("fs/pagebuf", 0)) + create_proc_read_entry( + "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL); +#endif + + pagebuf_cache = kmem_cache_create("page_buf_t", + sizeof(page_buf_private_t), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (pagebuf_cache == NULL) { + printk("pagebuf: couldn't init pagebuf cache\n"); + pagebuf_terminate(); + return -ENOMEM; + } + + if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH) { + printk("pagebuf: couldn't pre-allocate %d buffer heads\n", + NR_RESERVED_BH); + pagebuf_terminate(); + return -ENOMEM; + } + + init_waitqueue_head(&pb_resv_bh_wait); + + for (i = 0; i < NHASH; i++) { + spin_lock_init(&pbhash[i].pb_hash_lock); + INIT_LIST_HEAD(&pbhash[i].pb_hash); + } + +#ifdef PAGEBUF_TRACE +# if 1 
+ pb_trace.buf = (pagebuf_trace_t *)kmalloc( + PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t), GFP_KERNEL); +# else + /* Alternatively, for really really long trace bufs */ + pb_trace.buf = (pagebuf_trace_t *)vmalloc( + PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t)); +# endif + memset(pb_trace.buf, 0, PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t)); + pb_trace.start = 0; + pb_trace.end = PB_TRACE_BUFSIZE - 1; +#endif + + pagebuf_daemon_start(); + kmem_shake_register(pagebuf_shaker); + return 0; +} + +/* + * pagebuf_terminate. + * + * Note: do not mark as __exit, this is also called from the __init code. + */ +void +pagebuf_terminate(void) +{ + pagebuf_daemon_stop(); + 2d88: e8 37 fe ff ff call 2bc4 + + kmem_cache_destroy(pagebuf_cache); + 2d8d: a1 24 00 00 00 mov 0x24,%eax + 2d92: 50 push %eax + 2d93: e8 fc ff ff ff call 2d94 + kmem_shake_deregister(pagebuf_shaker); + 2d98: 68 7c 2d 00 00 push $0x2d7c + 2d9d: e8 fc ff ff ff call 2d9e + + unregister_sysctl_table(pagebuf_table_header); + 2da2: a1 84 06 00 00 mov 0x684,%eax + 2da7: 50 push %eax + 2da8: e8 fc ff ff ff call 2da9 +#ifdef CONFIG_PROC_FS + remove_proc_entry("fs/pagebuf/stat", NULL); + 2dad: 6a 00 push $0x0 + 2daf: 68 d5 00 00 00 push $0xd5 + 2db4: e8 fc ff ff ff call 2db5 + remove_proc_entry("fs/pagebuf", NULL); + 2db9: 6a 00 push $0x0 + 2dbb: 68 ca 00 00 00 push $0xca + 2dc0: e8 fc ff ff ff call 2dc1 +#endif +} + 2dc5: 83 c4 1c add $0x1c,%esp + 2dc8: c3 ret + +0000000000002dc9 : + 2dc9: 80 3d 40 00 00 00 00 cmpb $0x0,0x40 + 2dd0: f3 90 repz nop + 2dd2: 7e f5 jle 2dc9 + 2dd4: e9 95 d2 ff ff jmp 6e + 2dd9: 80 3d 40 00 00 00 00 cmpb $0x0,0x40 + 2de0: f3 90 repz nop + 2de2: 7e f5 jle 2dd9 + 2de4: e9 20 d3 ff ff jmp 109 + 2de9: 80 3b 00 cmpb $0x0,(%ebx) + 2dec: f3 90 repz nop + 2dee: 7e f9 jle 2de9 + 2df0: e9 25 da ff ff jmp 81a <_pagebuf_get_prealloc_bh+0x5a> + 2df5: 80 3b 00 cmpb $0x0,(%ebx) + 2df8: f3 90 repz nop + 2dfa: 7e f9 jle 2df5 + 2dfc: e9 c8 da ff ff jmp 8c9 <_pagebuf_get_prealloc_bh+0x109> + 2e01: 80 3b 00 
cmpb $0x0,(%ebx) + 2e04: f3 90 repz nop + 2e06: 7e f9 jle 2e01 + 2e08: e9 c2 db ff ff jmp 9cf <_pagebuf_find+0x6b> + 2e0d: e8 fc ff ff ff call 2e0e + 2e12: e9 eb dc ff ff jmp b02 <_pagebuf_find+0x19e> + 2e17: e8 fc ff ff ff call 2e18 + 2e1c: e9 96 e1 ff ff jmp fb7 + 2e21: 80 3b 00 cmpb $0x0,(%ebx) + 2e24: f3 90 repz nop + 2e26: 7e f9 jle 2e21 + 2e28: e9 eb e1 ff ff jmp 1018 + 2e2d: 80 3f 00 cmpb $0x0,(%edi) + 2e30: f3 90 repz nop + 2e32: 7e f9 jle 2e2d + 2e34: e9 3f e2 ff ff jmp 1078 + 2e39: 80 3e 00 cmpb $0x0,(%esi) + 2e3c: f3 90 repz nop + 2e3e: 7e f9 jle 2e39 + 2e40: e9 e1 e3 ff ff jmp 1226 + 2e45: 80 3f 00 cmpb $0x0,(%edi) + 2e48: f3 90 repz nop + 2e4a: 7e f9 jle 2e45 + 2e4c: e9 19 e5 ff ff jmp 136a + 2e51: e8 fc ff ff ff call 2e52 + 2e56: e9 86 e5 ff ff jmp 13e1 + 2e5b: 80 3e 00 cmpb $0x0,(%esi) + 2e5e: f3 90 repz nop + 2e60: 7e f9 jle 2e5b + 2e62: e9 b7 e6 ff ff jmp 151e <_end_pagebuf_page_io+0x9e> + 2e67: 80 3e 00 cmpb $0x0,(%esi) + 2e6a: f3 90 repz nop + 2e6c: 7e f9 jle 2e67 + 2e6e: e9 4b e8 ff ff jmp 16be <_end_pagebuf_page_io_multi+0xba> + 2e73: e8 fc ff ff ff call 2e74 + 2e78: e9 70 f0 ff ff jmp 1eed + 2e7d: 80 3e 00 cmpb $0x0,(%esi) + 2e80: f3 90 repz nop + 2e82: 7e f9 jle 2e7d + 2e84: e9 ca f3 ff ff jmp 2253 + 2e89: 80 3e 00 cmpb $0x0,(%esi) + 2e8c: f3 90 repz nop + 2e8e: 7e f9 jle 2e89 + 2e90: e9 7e f4 ff ff jmp 2313 + 2e95: 80 3b 00 cmpb $0x0,(%ebx) + 2e98: f3 90 repz nop + 2e9a: 7e f9 jle 2e95 + 2e9c: e9 48 f5 ff ff jmp 23e9 + 2ea1: 80 3b 00 cmpb $0x0,(%ebx) + 2ea4: f3 90 repz nop + 2ea6: 7e f9 jle 2ea1 + 2ea8: e9 5c f7 ff ff jmp 2609 + 2ead: 80 3b 00 cmpb $0x0,(%ebx) + 2eb0: f3 90 repz nop + 2eb2: 7e f9 jle 2ead + 2eb4: e9 48 f8 ff ff jmp 2701 + 2eb9: 80 3b 00 cmpb $0x0,(%ebx) + 2ebc: f3 90 repz nop + 2ebe: 7e f9 jle 2eb9 + 2ec0: e9 ec f9 ff ff jmp 28b1 + 2ec5: 80 3b 00 cmpb $0x0,(%ebx) + 2ec8: f3 90 repz nop + 2eca: 7e f9 jle 2ec5 + 2ecc: e9 24 fb ff ff jmp 29f5 +Disassembly of section .text.init: + +0000000000000000 : + 0: 55 push %ebp + 1: 57 
push %edi + 2: 56 push %esi + 3: 53 push %ebx + 4: 6a 01 push $0x1 + 6: 68 60 01 00 00 push $0x160 + b: e8 fc ff ff ff call c + 10: a3 84 06 00 00 mov %eax,0x684 + 15: 6a 00 push $0x0 + 17: 68 ca 00 00 00 push $0xca + 1c: e8 fc ff ff ff call 1d + 21: 83 c4 10 add $0x10,%esp + 24: 85 c0 test %eax,%eax + 26: 74 23 je 4b + 28: 6a 00 push $0x0 + 2a: 6a 00 push $0x0 + 2c: 68 d5 00 00 00 push $0xd5 + 31: e8 fc ff ff ff call 32 + 36: 83 c4 0c add $0xc,%esp + 39: 85 c0 test %eax,%eax + 3b: 74 0e je 4b + 3d: c7 40 38 ec 2c 00 00 movl $0x2cec,0x38(%eax) + 44: c7 40 34 00 00 00 00 movl $0x0,0x34(%eax) +{ +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + 4b: 6a 00 push $0x0 + 4d: 6a 00 push $0x0 + 4f: 68 00 20 00 00 push $0x2000 +printk("eip: %p\n", &&here); + 54: 6a 00 push $0x0 + 56: 68 cc 00 00 00 push $0xcc + 5b: 68 e5 00 00 00 push $0xe5 + 60: e8 fc ff ff ff call 61 + BUG(); + 65: a3 24 00 00 00 mov %eax,0x24 + 6a: 83 c4 18 add $0x18,%esp + } + 6d: 85 c0 test %eax,%eax +#endif + __asm__ __volatile__( + 6f: 75 1f jne 90 + 71: 68 00 01 00 00 push $0x100 + 76: e8 fc ff ff ff call 77 + 7b: e8 fc ff ff ff call 7c + 80: b8 f4 ff ff ff mov $0xfffffff4,%eax + 85: e9 a9 00 00 00 jmp 133 + 8a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + 90: 6a 40 push $0x40 + 92: e8 6c 07 00 00 call 803 <_pagebuf_get_prealloc_bh+0x43> + 97: 83 c4 04 add $0x4,%esp + 9a: 83 f8 3f cmp $0x3f,%eax + 9d: 7f 21 jg c0 + 9f: 6a 40 push $0x40 + :"=q" (oldval), "=m" (lock->lock) \ + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + a1: 68 40 01 00 00 push $0x140 + char oldval = 1; +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + a6: e8 fc ff ff ff call a7 + ab: e8 fc ff ff ff call ac + b0: b8 f4 ff ff ff mov $0xfffffff4,%eax + BUG(); + b5: 83 c4 08 add $0x8,%esp + b8: eb 7c jmp 136 + if (!spin_is_locked(lock)) + ba: 8d b6 00 00 00 00 lea 0x0(%esi),%esi + c0: b8 01 00 00 00 mov $0x1,%eax + BUG(); + c5: ba ad 4e ad de mov $0xdead4ead,%edx 
+#endif + __asm__ __volatile__( + ca: a3 40 03 00 00 mov %eax,0x340 + cf: 89 15 44 03 00 00 mov %edx,0x344 + d5: c7 05 48 03 00 00 48 movl $0x348,0x348 + dc: 03 00 00 + df: c7 05 4c 03 00 00 48 movl $0x348,0x34c + e6: 03 00 00 +{ +#if SPINLOCK_DEBUG + __label__ here; +here: + if (lock->magic != SPINLOCK_MAGIC) { + e9: bd 6c 03 00 00 mov $0x36c,%ebp + ee: bb 60 03 00 00 mov $0x360,%ebx +printk("eip: %p\n", &&here); + f3: 89 df mov %ebx,%edi + f5: 31 c9 xor %ecx,%ecx + f7: be 1f 00 00 00 mov $0x1f,%esi + fc: 8d 74 26 00 lea 0x0(%esi,1),%esi + BUG(); + 100: b8 01 00 00 00 mov $0x1,%eax + 105: ba ad 4e ad de mov $0xdead4ead,%edx + } +#endif + __asm__ __volatile__( + 10a: 89 44 0d 00 mov %eax,0x0(%ebp,%ecx,1) + 10e: 89 54 0d 04 mov %edx,0x4(%ebp,%ecx,1) + 112: 89 1c 0f mov %ebx,(%edi,%ecx,1) + 115: 89 5c 0f 04 mov %ebx,0x4(%edi,%ecx,1) + 119: 83 c3 14 add $0x14,%ebx + 11c: 83 c1 14 add $0x14,%ecx + 11f: 4e dec %esi + 120: 79 de jns 100 + 122: e8 c8 2a 00 00 call 2bef + 127: 68 7c 2d 00 00 push $0x2d7c + 12c: e8 fc ff ff ff call 12d + :"0" (oldval) : "memory" + +static inline void spin_unlock(spinlock_t *lock) +{ + char oldval = 1; + 131: 31 c0 xor %eax,%eax +#if SPINLOCK_DEBUG + if (lock->magic != SPINLOCK_MAGIC) + 133: 83 c4 04 add $0x4,%esp + 136: 5b pop %ebx + 137: 5e pop %esi + 138: 5f pop %edi + 139: 5d pop %ebp + 13a: c3 ret diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/page_buf_inline.h linux-2.4-xfs/fs/xfs/pagebuf/page_buf_inline.h --- linux-2.4.19/fs/xfs/pagebuf/page_buf_inline.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/page_buf_inline.h Tue Aug 6 17:34:28 2002 @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "page_buf.h" + +/* + * pagebuf_cond_lock + * + * pagebuf_cond_lock locks a buffer object, if it is not already locked. + * Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +static __inline int +pagebuf_cond_lock( + page_buf_t *pb) +{ + int locked; + + assert(pb->pb_flags & _PBF_LOCKABLE); + + locked = down_trylock(&PBP(pb)->pb_sema) == 0; + if (locked) { + PB_SET_OWNER(pb); + PB_TRACE(pb, PB_TRACE_REC(condlck), locked); + return 0; + } + + PB_TRACE(pb, PB_TRACE_REC(condlck), 0); + return -EBUSY; +} + +/* + * pagebuf_is_locked + * + * pagebuf_is_locked tests if the buffer is locked, return 1 if locked + * and 0 if not. 
This routine is useful only for assertions that + * the buffer is locked, since the state could change at any time + * if the buffer is not locked. + */ +static __inline int +pagebuf_is_locked( + page_buf_t *pb) +{ + assert(pb->pb_flags & _PBF_LOCKABLE); + + return (atomic_read(&PBP(pb)->pb_sema.count) <= 0); +} + +/* + * pagebuf_lock_value + * + * Return lock value for a pagebuf + */ +static __inline int +pagebuf_lock_value( + page_buf_t *pb) +{ + assert(pb->pb_flags & _PBF_LOCKABLE); + + return (atomic_read(&PBP(pb)->pb_sema.count)); +} + +/* + * pagebuf_lock + * + * pagebuf_lock locks a buffer object. Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +static __inline int +pagebuf_lock( + page_buf_t *pb) +{ + assert(pb->pb_flags & _PBF_LOCKABLE); + + PB_TRACE(pb, PB_TRACE_REC(lock), 0); + if (atomic_read(&PBP(pb)->pb_io_remaining)) + run_task_queue(&tq_disk); + down(&PBP(pb)->pb_sema); + PB_SET_OWNER(pb); + PB_TRACE(pb, PB_TRACE_REC(locked), 0); + return 0; +} + +/* + * pagebuf_unlock + * + * pagebuf_unlock releases the lock on the buffer object created by + * pagebuf_lock or pagebuf_cond_lock (not any + * pinning of underlying pages created by pagebuf_pin). + */ +static __inline void +pagebuf_unlock( + page_buf_t *pb) +{ + assert(pb->pb_flags & _PBF_LOCKABLE); + + PB_CLEAR_OWNER(pb); + up(&PBP(pb)->pb_sema); + PB_TRACE(pb, PB_TRACE_REC(unlock), 0); +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/page_buf_internal.h linux-2.4-xfs/fs/xfs/pagebuf/page_buf_internal.h --- linux-2.4.19/fs/xfs/pagebuf/page_buf_internal.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/page_buf_internal.h Wed Sep 4 22:35:19 2002 @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2002 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Written by Steve Lord at SGI + */ + +#ifndef __PAGE_BUF_PRIVATE_H__ +#define __PAGE_BUF_PRIVATE_H__ + +#include "page_buf.h" + +#define _PAGE_BUF_INTERNAL_ +#define PB_DEFINE_TRACES +#include "page_buf_trace.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,9) +#define page_buffers(page) ((page)->buffers) +#define page_has_buffers(page) ((page)->buffers) +#endif + +typedef struct page_buf_private_s { + page_buf_t pb_common; /* public part of structure */ + struct semaphore pb_sema; /* semaphore for lockables */ + unsigned long pb_flushtime; /* time to flush pagebuf */ + atomic_t pb_io_remaining;/* #outstanding I/O requests */ + atomic_t pb_pin_count; /* pin count */ + wait_queue_head_t pb_waiters; /* unpin waiters */ +#ifdef PAGEBUF_LOCK_TRACKING + int pb_last_holder; +#endif +} page_buf_private_t; + +#define PBC(pb) (&((pb)->pb_common)) +#define PBP(pb) ((page_buf_private_t *) (pb)) + +#ifdef PAGEBUF_LOCK_TRACKING +#define PB_SET_OWNER(pb) (PBP(pb)->pb_last_holder = current->pid) +#define PB_CLEAR_OWNER(pb) (PBP(pb)->pb_last_holder = -1) +#define PB_GET_OWNER(pb) (PBP(pb)->pb_last_holder) +#else +#define PB_SET_OWNER(pb) +#define PB_CLEAR_OWNER(pb) +#define PB_GET_OWNER(pb) +#endif /* PAGEBUF_LOCK_TRACKING */ + +/* Tracing utilities for pagebuf */ +typedef struct { + int event; + unsigned long pb; + page_buf_flags_t flags; + unsigned short hold; + unsigned short lock_value; + void *task; + void *misc; + void *ra; + loff_t offset; + size_t size; +} pagebuf_trace_t; + +struct pagebuf_trace_buf { + pagebuf_trace_t *buf; + volatile int start; + volatile int end; +}; + +#define PB_TRACE_BUFSIZE 1024 +#define CIRC_INC(i) (((i) + 1) & (PB_TRACE_BUFSIZE - 1)) + +typedef struct pagebuf_daemon { + int 
active; + int io_active; + spinlock_t pb_delwrite_lock; + struct list_head pb_delwrite_l; + int pb_delwri_cnt; +} pagebuf_daemon_t; + +/* + * Tunable pagebuf parameters + */ + +#define P_PARAM 4 + +typedef union pagebuf_param { + struct { + ulong flush_interval; /* interval between runs of the + * delwri flush daemon. */ + ulong age_buffer; /* time for buffer to age before + * we flush it. */ + ulong debug; /* debug tracing on or off */ + ulong stats_clear; /* clear the pagebuf stats */ + } p_un; + ulong data[P_PARAM]; +} pagebuf_param_t; + +enum { + PB_FLUSH_INT = 1, + PB_FLUSH_AGE = 2, + PB_STATS_CLEAR = 3, + PB_DEBUG = 4 +}; + +extern pagebuf_param_t pb_params; + +/* + * Pagebuf statistics + */ + +struct pbstats { + u_int32_t pb_get; + u_int32_t pb_create; + u_int32_t pb_get_locked; + u_int32_t pb_get_locked_waited; + u_int32_t pb_busy_locked; + u_int32_t pb_miss_locked; + u_int32_t pb_page_retries; + u_int32_t pb_page_found; + u_int32_t pb_get_read; +}; + +extern struct pbstats pbstats; + +#define PB_STATS_INC(count) ( count ++ ) + +#undef assert +#ifdef PAGEBUF_DEBUG +# define assert(expr) \ + if (!(expr)) { \ + printk("Assertion failed: %s\n%s::%s line %d\n",\ + #expr,__FILE__,__FUNCTION__,__LINE__); \ + BUG(); \ + } +#else +# define assert(x) do { } while (0) +#endif + +#ifndef STATIC +# define STATIC static +#endif + +#endif /* __PAGE_BUF_PRIVATE_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/page_buf_locking.c linux-2.4-xfs/fs/xfs/pagebuf/page_buf_locking.c --- linux-2.4.19/fs/xfs/pagebuf/page_buf_locking.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/page_buf_locking.c Wed Aug 21 01:24:17 2002 @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * Portions Copyright (c) 2002 Christoph Hellwig. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * page_buf_locking.c + * + * The page_buf module provides an abstract buffer cache model on top of + * the Linux page cache. Cached blocks for a file are hashed to the + * inode for that file, and can be held dirty in delayed write mode in + * the page cache. Cached metadata blocks for a file system are hashed + * to the inode for the mounted device. The page_buf module assembles + * buffer (page_buf_t) objects on demand to aggregate such cached pages + * for I/O. The page_buf_locking module adds support for locking such + * page buffers. 
+ * + * Written by Steve Lord at SGI + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "page_buf_internal.h" + +#ifndef EVMS_MAJOR +#define EVMS_MAJOR 117 +#endif + +/* + * pagebuf_cond_lock + * + * pagebuf_cond_lock locks a buffer object, if it is not already locked. + * Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +int +pagebuf_cond_lock( /* lock buffer, if not locked */ + /* returns -EBUSY if locked) */ + page_buf_t *pb) +{ + int locked; + + assert(pb->pb_flags & _PBF_LOCKABLE); + + locked = down_trylock(&PBP(pb)->pb_sema) == 0; + if (locked) { + PB_SET_OWNER(pb); + } + + PB_TRACE(pb, PB_TRACE_REC(condlck), locked); + + return(locked ? 0 : -EBUSY); +} + +/* + * pagebuf_lock_value + * + * Return lock value for a pagebuf + */ +int +pagebuf_lock_value( + page_buf_t *pb) +{ + assert(pb->pb_flags & _PBF_LOCKABLE); + return(atomic_read(&PBP(pb)->pb_sema.count)); +} + +/* + * pagebuf_lock + * + * pagebuf_lock locks a buffer object. Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +int +pagebuf_lock( + page_buf_t *pb) +{ + assert(pb->pb_flags & _PBF_LOCKABLE); + + PB_TRACE(pb, PB_TRACE_REC(lock), 0); + if (atomic_read(&PBP(pb)->pb_io_remaining)) + run_task_queue(&tq_disk); + down(&PBP(pb)->pb_sema); + PB_SET_OWNER(pb); + PB_TRACE(pb, PB_TRACE_REC(locked), 0); + return 0; +} + +/* + * pagebuf_lock_disable + * + * pagebuf_lock_disable disables buffer object locking for an inode. + * remove_super() does a blkdev_put for us on the data device, hence + * the do_blkdev_put argument. 
+ */ +void +pagebuf_lock_disable( + pb_target_t *target, + int do_blkdev_put) +{ + pagebuf_delwri_flush(target, PBDF_WAIT, NULL); + if (do_blkdev_put) + blkdev_put(target->pbr_bdev, BDEV_FS); + kfree(target); +} + +/* + * pagebuf_lock_enable + * + * get_sb_bdev() does a blkdev_get for us on the data device, hence + * the do_blkdev_get argument. + */ +pb_target_t * +pagebuf_lock_enable( + dev_t dev, + int do_blkdev_get) +{ + struct block_device *bdev; + pb_target_t *target; + int error = -ENOMEM; + + target = kmalloc(sizeof(pb_target_t), GFP_KERNEL); + if (unlikely(!target)) + return ERR_PTR(error); + + bdev = bdget(dev); + if (unlikely(!bdev)) + goto fail; + + if (do_blkdev_get) { + error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS); + if (unlikely(error)) + goto fail; + } + + target->pbr_dev = dev; + target->pbr_kdev = to_kdev_t(dev); + target->pbr_bdev = bdev; + target->pbr_mapping = bdev->bd_inode->i_mapping; + + pagebuf_target_blocksize(target, PAGE_CACHE_SIZE); + + if ((MAJOR(dev) == MD_MAJOR) || (MAJOR(dev) == EVMS_MAJOR)) + target->pbr_flags = PBR_ALIGNED_ONLY; + else if (MAJOR(dev) == LVM_BLK_MAJOR) + target->pbr_flags = PBR_SECTOR_ONLY; + else + target->pbr_flags = 0; + + return target; + +fail: + kfree(target); + return ERR_PTR(error); +} + +void +pagebuf_target_blocksize( + pb_target_t *target, + unsigned int blocksize) +{ + target->pbr_blocksize = blocksize; + target->pbr_blocksize_bits = ffs(blocksize) - 1; +} + +void +pagebuf_target_clear( + pb_target_t *target) +{ + destroy_buffers(target->pbr_kdev); + truncate_inode_pages(target->pbr_mapping, 0LL); +} + +/* + * pagebuf_unlock + * + * pagebuf_unlock releases the lock on the buffer object created by + * pagebuf_lock or pagebuf_cond_lock (not any + * pinning of underlying pages created by pagebuf_pin). 
+ */ +void +pagebuf_unlock( /* unlock buffer */ + page_buf_t *pb) /* buffer to unlock */ +{ + assert(pb->pb_flags & _PBF_LOCKABLE); + PB_CLEAR_OWNER(pb); + up(&PBP(pb)->pb_sema); + PB_TRACE(pb, PB_TRACE_REC(unlock), 0); +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/pagebuf/page_buf_trace.h linux-2.4-xfs/fs/xfs/pagebuf/page_buf_trace.h --- linux-2.4.19/fs/xfs/pagebuf/page_buf_trace.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/pagebuf/page_buf_trace.h Wed Jul 10 23:14:43 2002 @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef __PAGEBUF_TRACE__ +#define __PAGEBUF_TRACE__ + +#ifdef PB_DEFINE_TRACES +#define PB_TRACE_START typedef enum { +#define PB_TRACE_REC(x) pb_trace_point_##x +#define PB_TRACE_END } pb_trace_var_t; +#else +#define PB_TRACE_START static char *event_names[] = { +#define PB_TRACE_REC(x) #x +#define PB_TRACE_END }; +#endif + +PB_TRACE_START +PB_TRACE_REC(get), +PB_TRACE_REC(get_obj), +PB_TRACE_REC(free_obj), +PB_TRACE_REC(look_pg), +PB_TRACE_REC(get_read), +PB_TRACE_REC(no_daddr), +PB_TRACE_REC(hold), +PB_TRACE_REC(rele), +PB_TRACE_REC(done), +PB_TRACE_REC(ioerror), +PB_TRACE_REC(iostart), +PB_TRACE_REC(end_io), +PB_TRACE_REC(do_io), +PB_TRACE_REC(ioreq), +PB_TRACE_REC(iowait), +PB_TRACE_REC(iowaited), +PB_TRACE_REC(free_lk), +PB_TRACE_REC(freed_l), +PB_TRACE_REC(cmp), +PB_TRACE_REC(get_lk), +PB_TRACE_REC(got_lk), +PB_TRACE_REC(skip), +PB_TRACE_REC(lock), +PB_TRACE_REC(locked), +PB_TRACE_REC(unlock), +PB_TRACE_REC(avl_ret), +PB_TRACE_REC(condlck), +PB_TRACE_REC(avl_ins), +PB_TRACE_REC(walkq1), +PB_TRACE_REC(walkq2), +PB_TRACE_REC(walkq3), +PB_TRACE_REC(delwri_q), +PB_TRACE_REC(delwri_uq), +PB_TRACE_REC(pin), +PB_TRACE_REC(unpin), +PB_TRACE_REC(file_write), +PB_TRACE_REC(external), +PB_TRACE_END + +extern void pb_trace_func(page_buf_t *, int, void *, void *); +#ifdef PAGEBUF_TRACE +# define PB_TRACE(pb, event, misc) \ + pb_trace_func(pb, event, (void *) misc, \ + (void *)__builtin_return_address(0)) +#else +# define PB_TRACE(pb, event, misc) do { } while (0) +#endif + +#endif /* __PAGEBUF_TRACE__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/Makefile linux-2.4-xfs/fs/xfs/support/Makefile --- linux-2.4.19/fs/xfs/support/Makefile Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/Makefile 
Fri Aug 23 18:23:29 2002 @@ -0,0 +1,54 @@ +# +# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# + +EXTRA_CFLAGS += -I.. 
+ +ifeq ($(CONFIG_XFS_DEBUG),y) + EXTRA_CFLAGS += -DDEBUG +endif + +O_TARGET := support_xfs.o +ifneq ($(MAKECMDGOALS),modules_install) + obj-m := $(O_TARGET) +endif + +export-objs := ktrace.o + +obj-y := debug.o \ + kmem.o \ + ktrace.o \ + move.o \ + mrlock.o \ + qsort.o \ + uuid.o + +include $(TOPDIR)/Rules.make diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/Makefile.in linux-2.4-xfs/fs/xfs/support/Makefile.in --- linux-2.4.19/fs/xfs/support/Makefile.in Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/Makefile.in Wed Jul 10 23:14:43 2002 @@ -0,0 +1,39 @@ +# +# Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# + +expsyms(ktrace.o) +objlink(support_xfs.o debug.o kmem.o ktrace.o move.o mrlock.o qsort.o uuid.o) + +# No select() for support_xfs.o in this directory. It is a sub-component of XFS, +# see fs/xfs/Makefile.in for the objlink. + +extra_cflags_all($(XFS_EXTRA_CFLAGS)) diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/atomic.h linux-2.4-xfs/fs/xfs/support/atomic.h --- linux-2.4.19/fs/xfs/support/atomic.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/atomic.h Wed Sep 4 22:35:42 2002 @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_ATOMIC_H__ +#define __XFS_SUPPORT_ATOMIC_H__ + +#include +#include +#include +#include +#include +#include + +/* + * This is used for two variables in XFS, one of which is a debug trace + * buffer index. They are not accessed via any other atomic operations + * so this is safe. All other atomic increments and decrements in XFS + * now use the linux built in functions. + */ + +extern spinlock_t Atomic_spin; + +static __inline__ int atomicIncWithWrap(int *ip, int val) +{ + unsigned long flags; + int ret; + spin_lock_irqsave(&Atomic_spin, flags); + ret = *ip; + (*ip)++; + if (*ip == val) *ip = 0; + spin_unlock_irqrestore(&Atomic_spin, flags); + return ret; +} + +#endif /* __XFS_SUPPORT_ATOMIC_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/debug.c linux-2.4-xfs/fs/xfs/support/debug.c --- linux-2.4.19/fs/xfs/support/debug.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/debug.c Thu Aug 29 03:39:02 2002 @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. 
Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "debug.h" + +#include +#include +#include + +int doass = 1; +static char message[256]; /* keep it off the stack */ +static spinlock_t xfs_err_lock = SPIN_LOCK_UNLOCKED; + +void +assfail(char *a, char *f, int l) +{ + printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l); + BUG(); +} + +#ifdef DEBUG + +unsigned long +random(void) +{ + static unsigned long RandomValue = 1; + /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ + register long rv = RandomValue; + register long lo; + register long hi; + + hi = rv / 127773; + lo = rv % 127773; + rv = 16807 * lo - 2836 * hi; + if( rv <= 0 ) rv += 2147483647; + return( RandomValue = rv ); +} + +int +get_thread_id(void) +{ + return current->pid; +} + +# define xdprintk(format...) printk(format) +#else +# define xdprintk(format...) do { } while (0) +#endif + +void +cmn_err(register int level, char *fmt, ...) 
+{ + char *fp = fmt; + va_list ap; + + spin_lock(&xfs_err_lock); + va_start(ap, fmt); + if (*fmt == '!') fp++; + vsprintf(message, fp, ap); + switch (level) { + case CE_CONT: + printk("%s", message); + break; + case CE_DEBUG: + xdprintk("%s", message); + break; + default: + printk("%s\n", message); + break; + } + va_end(ap); + spin_unlock(&xfs_err_lock); + + if (level == CE_PANIC) + BUG(); +} + + +void +icmn_err(register int level, char *fmt, va_list ap) +{ + spin_lock(&xfs_err_lock); + vsprintf(message, fmt, ap); + switch (level) { + case CE_CONT: + printk("%s", message); + break; + case CE_DEBUG: + xdprintk("%s", message); + break; + default: + printk("cmn_err level %d ", level); + printk("%s\n", message); + break; + } + spin_unlock(&xfs_err_lock); +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/debug.h linux-2.4-xfs/fs/xfs/support/debug.h --- linux-2.4.19/fs/xfs/support/debug.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/debug.h Sat Jul 27 23:32:57 2002 @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_DEBUG_H__ +#define __XFS_SUPPORT_DEBUG_H__ + +#include + +#define CE_DEBUG 7 /* debug */ +#define CE_CONT 6 /* continuation */ +#define CE_NOTE 5 /* notice */ +#define CE_WARN 4 /* warning */ +#define CE_ALERT 1 /* alert */ +#define CE_PANIC 0 /* panic */ + +extern void icmn_err(int, char *, va_list); +extern void cmn_err(int, char *, ...); + +#ifdef DEBUG +# ifdef lint +# define ASSERT(EX) ((void)0) /* avoid "constant in conditional" babble */ +# else +# define ASSERT(EX) ((!doass||(EX))?((void)0):assfail(#EX, __FILE__, __LINE__)) +# endif /* lint */ +#else +# define ASSERT(x) ((void)0) +#endif + +extern int doass; /* dynamically turn off asserts */ +extern void assfail(char *, char *, int); +#ifdef DEBUG +extern unsigned long random(void); +extern int get_thread_id(void); +#endif + +#define ASSERT_ALWAYS(EX) ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__)) +#define debug_stop_all_cpus(param) /* param is "cpumask_t *" */ + +#endif /* __XFS_SUPPORT_DEBUG_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/kmem.c linux-2.4-xfs/fs/xfs/support/kmem.c --- linux-2.4.19/fs/xfs/support/kmem.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/kmem.c Fri Aug 23 14:22:47 2002 @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include +#include + +#include "time.h" +#include "kmem.h" + +#define DEF_PRIORITY (6) +#define MAX_SLAB_SIZE 0x10000 + +static __inline unsigned int flag_convert(int flags) +{ +#if DEBUG + if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) { + printk(KERN_WARNING + "XFS: memory allocation with wrong flags (%x)\n", flags); + BUG(); + } +#endif + + if (flags & KM_NOSLEEP) + return GFP_ATOMIC; + /* If we're in a transaction, FS activity is not ok */ + else if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + return GFP_NOFS; + else + return GFP_KERNEL; +} + +#define MAX_SHAKE 8 + +static kmem_shake_func_t shake_list[MAX_SHAKE]; +static DECLARE_MUTEX(shake_sem); + +void kmem_shake_register(kmem_shake_func_t sfunc) +{ + int i; + + down(&shake_sem); + for (i = 0; i < MAX_SHAKE; i++) { + if (shake_list[i] == NULL) { + shake_list[i] = sfunc; + break; + } + } + if (i == 
MAX_SHAKE) + BUG(); + up(&shake_sem); +} + +void kmem_shake_deregister(kmem_shake_func_t sfunc) +{ + int i; + + down(&shake_sem); + for (i = 0; i < MAX_SHAKE; i++) { + if (shake_list[i] == sfunc) + break; + } + if (i == MAX_SHAKE) + BUG(); + for (; i < MAX_SHAKE - 1; i++) { + shake_list[i] = shake_list[i+1]; + } + shake_list[i] = NULL; + up(&shake_sem); +} + +static __inline__ void kmem_shake(void) +{ + int i; + + down(&shake_sem); + for (i = 0; i < MAX_SHAKE && shake_list[i]; i++) + (*shake_list[i])(); + up(&shake_sem); + delay(10); +} + +void * +kmem_alloc(size_t size, int flags) +{ + int shrink = DEF_PRIORITY; /* # times to try to shrink cache */ + void *rval; + +repeat: + if (MAX_SLAB_SIZE < size) { + /* Avoid doing filesystem sensitive stuff to get this */ + rval = __vmalloc(size, flag_convert(flags), PAGE_KERNEL); + } else { + rval = kmalloc(size, flag_convert(flags)); + } + + if (rval || (flags & KM_NOSLEEP)) + return rval; + + /* + * KM_SLEEP callers don't expect a failure + */ + if (shrink) { + kmem_shake(); + + shrink--; + goto repeat; + } + + rval = __vmalloc(size, flag_convert(flags), PAGE_KERNEL); + if (!rval && !(flags & KM_SLEEP)) + panic("kmem_alloc: NULL memory on KM_SLEEP request!"); + + return rval; +} + +void * +kmem_zalloc(size_t size, int flags) +{ + void *ptr; + + ptr = kmem_alloc(size, flags); + + if (ptr) + memset((char *)ptr, 0, (int)size); + + return (ptr); +} + +void +kmem_free(void *ptr, size_t size) +{ + if (((unsigned long)ptr < VMALLOC_START) || + ((unsigned long)ptr >= VMALLOC_END)) { + kfree(ptr); + } else { + vfree(ptr); + } +} + +void * +kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags) +{ + void *new; + + new = kmem_alloc(newsize, flags); + if (ptr) { + memcpy(new, ptr, ((oldsize < newsize) ? 
oldsize : newsize)); + kmem_free(ptr, oldsize); + } + + return new; +} + +kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); +} + +void * +kmem_zone_alloc(kmem_zone_t *zone, int flags) +{ + int shrink = DEF_PRIORITY; /* # times to try to shrink cache */ + void *ptr = NULL; + +repeat: + ptr = kmem_cache_alloc(zone, flag_convert(flags)); + + if (ptr || (flags & KM_NOSLEEP)) + return ptr; + + /* + * KM_SLEEP callers don't expect a failure + */ + if (shrink) { + kmem_shake(); + + shrink--; + goto repeat; + } + + if (flags & KM_SLEEP) + panic("kmem_zone_alloc: NULL memory on KM_SLEEP request!"); + + return NULL; +} + +void * +kmem_zone_zalloc(kmem_zone_t *zone, int flags) +{ + int shrink = DEF_PRIORITY; /* # times to try to shrink cache */ + void *ptr = NULL; + +repeat: + ptr = kmem_cache_zalloc(zone, flag_convert(flags)); + + if (ptr || (flags & KM_NOSLEEP)) + return ptr; + + /* + * KM_SLEEP callers don't expect a failure + */ + if (shrink) { + kmem_shake(); + + shrink--; + goto repeat; + } + + if (flags & KM_SLEEP) + panic("kmem_zone_zalloc: NULL memory on KM_SLEEP request!"); + + return NULL; +} + +void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/kmem.h linux-2.4-xfs/fs/xfs/support/kmem.h --- linux-2.4.19/fs/xfs/support/kmem.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/kmem.h Wed Sep 4 22:35:41 2002 @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_KMEM_H__ +#define __XFS_SUPPORT_KMEM_H__ + +#include + +/* + * memory management routines + */ +#define KM_SLEEP 0x0001 +#define KM_NOSLEEP 0x0002 +#define KM_NOFS 0x0004 + +#define kmem_zone kmem_cache_s +#define kmem_zone_t kmem_cache_t + +extern kmem_zone_t *kmem_zone_init(int, char *); +extern void *kmem_zone_zalloc(kmem_zone_t *, int); +extern void *kmem_zone_alloc(kmem_zone_t *, int); +extern void kmem_zone_free(kmem_zone_t *, void *); + +extern void *kmem_alloc(size_t, int); +extern void *kmem_realloc(void *, size_t, size_t, int); +extern void *kmem_zalloc(size_t, int); +extern void kmem_free(void *, size_t); + +typedef void (*kmem_shake_func_t)(void); + +extern void kmem_shake_register(kmem_shake_func_t); +extern void kmem_shake_deregister(kmem_shake_func_t); + +#endif /* __XFS_SUPPORT_KMEM_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/ktrace.c linux-2.4-xfs/fs/xfs/support/ktrace.c --- linux-2.4.19/fs/xfs/support/ktrace.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/ktrace.c Tue Aug 20 17:36:13 2002 @@ -0,0 +1,373 @@ +/* + * 
Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +#include +#include "kmem.h" +#include "spin.h" +#include "debug.h" +#include "atomic.h" +#include "ktrace.h" + +#if (defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING)) + +static kmem_zone_t *ktrace_hdr_zone; +static kmem_zone_t *ktrace_ent_zone; +static int ktrace_zentries; + +void +ktrace_init(int zentries) +{ + ktrace_zentries = zentries; + + ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), + "ktrace_hdr"); + ASSERT(ktrace_hdr_zone); + + ktrace_ent_zone = kmem_zone_init(ktrace_zentries + * sizeof(ktrace_entry_t), + "ktrace_ent"); + ASSERT(ktrace_ent_zone); +} + +void +ktrace_uninit(void) +{ + kmem_cache_destroy(ktrace_hdr_zone); + kmem_cache_destroy(ktrace_ent_zone); +} + +/* + * ktrace_alloc() + * + * Allocate a ktrace header and enough buffering for the given + * number of entries. + */ +ktrace_t * +ktrace_alloc(int nentries, int sleep) +{ + ktrace_t *ktp; + ktrace_entry_t *ktep; + + ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); + + if (ktp == (ktrace_t*)NULL) { + /* + * KM_SLEEP callers don't expect failure. + */ + if (sleep & KM_SLEEP) + panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); + + return NULL; + } + + /* + * Special treatment for buffers with the ktrace_zentries entries + */ + if (nentries == ktrace_zentries) { + ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, + sleep); + } else { + ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), + sleep); + } + + if (ktep == NULL) { + /* + * KM_SLEEP callers don't expect failure. 
+ */ + if (sleep & KM_SLEEP) + panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); + + kmem_free(ktp, sizeof(*ktp)); + + return NULL; + } + + spinlock_init(&(ktp->kt_lock), "kt_lock"); + + ktp->kt_entries = ktep; + ktp->kt_nentries = nentries; + ktp->kt_index = 0; + ktp->kt_rollover = 0; + + return ktp; +} + + +/* + * ktrace_free() + * + * Free up the ktrace header and buffer. It is up to the caller + * to ensure that no-one is referencing it. + */ +void +ktrace_free(ktrace_t *ktp) +{ + int entries_size; + + if (ktp == (ktrace_t *)NULL) + return; + + spinlock_destroy(&ktp->kt_lock); + + /* + * Special treatment for the Vnode trace buffer. + */ + if (ktp->kt_nentries == ktrace_zentries) { + kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); + } else { + entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); + + kmem_free(ktp->kt_entries, entries_size); + } + + kmem_zone_free(ktrace_hdr_zone, ktp); +} + + +/* + * Enter the given values into the "next" entry in the trace buffer. + * kt_index is always the index of the next entry to be filled. + */ +void +ktrace_enter( + ktrace_t *ktp, + void *val0, + void *val1, + void *val2, + void *val3, + void *val4, + void *val5, + void *val6, + void *val7, + void *val8, + void *val9, + void *val10, + void *val11, + void *val12, + void *val13, + void *val14, + void *val15) +{ + int index; + ktrace_entry_t *ktep; + + ASSERT(ktp != NULL); + + /* + * Grab an entry by pushing the index up to the next one. 
+ */ + index = atomicIncWithWrap(&ktp->kt_index, ktp->kt_nentries); + + if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) + ktp->kt_rollover = 1; + + ASSERT((index >= 0) && (index < ktp->kt_nentries)); + + ktep = &(ktp->kt_entries[index]); + + ktep->val[0] = val0; + ktep->val[1] = val1; + ktep->val[2] = val2; + ktep->val[3] = val3; + ktep->val[4] = val4; + ktep->val[5] = val5; + ktep->val[6] = val6; + ktep->val[7] = val7; + ktep->val[8] = val8; + ktep->val[9] = val9; + ktep->val[10] = val10; + ktep->val[11] = val11; + ktep->val[12] = val12; + ktep->val[13] = val13; + ktep->val[14] = val14; + ktep->val[15] = val15; +} + +/* + * Return the number of entries in the trace buffer. + */ +int +ktrace_nentries( + ktrace_t *ktp) +{ + if (ktp == NULL) { + return 0; + } + + return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index); +} + + +/* + * ktrace_first() + * + * This is used to find the start of the trace buffer. + * In conjunction with ktrace_next() it can be used to + * iterate through the entire trace buffer. This code does + * not do any locking because it is assumed that it is called + * from the debugger. + * + * The caller must pass in a pointer to a ktrace_snap + * structure in which we will keep some state used to + * iterate through the buffer. This state must not touched + * by any code outside of this module. + */ +ktrace_entry_t * +ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp) +{ + ktrace_entry_t *ktep; + int index; + int nentries; + + if (ktp->kt_rollover) + index = ktp->kt_index; + else + index = 0; + + ktsp->ks_start = index; + ktep = &(ktp->kt_entries[index]); + + nentries = ktrace_nentries(ktp); + index++; + if (index < nentries) { + ktsp->ks_index = index; + } else { + ktsp->ks_index = 0; + if (index > nentries) + ktep = NULL; + } + return ktep; +} + + +/* + * ktrace_next() + * + * This is used to iterate through the entries of the given + * trace buffer. 
The caller must pass in the ktrace_snap_t + * structure initialized by ktrace_first(). The return value + * will be either a pointer to the next ktrace_entry or NULL + * if all of the entries have been traversed. + */ +ktrace_entry_t * +ktrace_next( + ktrace_t *ktp, + ktrace_snap_t *ktsp) +{ + int index; + ktrace_entry_t *ktep; + + index = ktsp->ks_index; + if (index == ktsp->ks_start) { + ktep = NULL; + } else { + ktep = &ktp->kt_entries[index]; + } + + index++; + if (index == ktrace_nentries(ktp)) { + ktsp->ks_index = 0; + } else { + ktsp->ks_index = index; + } + + return ktep; +} + +#if (defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING)) +EXPORT_SYMBOL(ktrace_first); +EXPORT_SYMBOL(ktrace_next); +#endif + +/* + * ktrace_skip() + * + * Skip the next "count" entries and return the entry after that. + * Return NULL if this causes us to iterate past the beginning again. + */ + +ktrace_entry_t * +ktrace_skip( + ktrace_t *ktp, + int count, + ktrace_snap_t *ktsp) +{ + int index; + int new_index; + ktrace_entry_t *ktep; + int nentries = ktrace_nentries(ktp); + + index = ktsp->ks_index; + new_index = index + count; + while (new_index >= nentries) { + new_index -= nentries; + } + if (index == ktsp->ks_start) { + /* + * We've iterated around to the start, so we're done. + */ + ktep = NULL; + } else if ((new_index < index) && (index < ktsp->ks_index)) { + /* + * We've skipped past the start again, so we're done. + */ + ktep = NULL; + ktsp->ks_index = ktsp->ks_start; + } else { + ktep = &(ktp->kt_entries[new_index]); + new_index++; + if (new_index == nentries) { + ktsp->ks_index = 0; + } else { + ktsp->ks_index = new_index; + } + } + return ktep; +} + +#else + +ktrace_t * +ktrace_alloc(int nentries, int sleep) +{ + /* + * KM_SLEEP callers don't expect failure. 
+ */ + if (sleep & KM_SLEEP) + panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); + + return NULL; +} +#endif diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/ktrace.h linux-2.4-xfs/fs/xfs/support/ktrace.h --- linux-2.4.19/fs/xfs/support/ktrace.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/ktrace.h Sat Jul 27 13:08:38 2002 @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_KTRACE_H__ +#define __XFS_SUPPORT_KTRACE_H__ + + +/* + * Trace buffer entry structure. + */ +typedef struct ktrace_entry { + void *val[16]; +} ktrace_entry_t; + +/* + * Trace buffer header structure. 
+ */ +typedef struct ktrace { + lock_t kt_lock; /* mutex to guard counters */ + int kt_nentries; /* number of entries in trace buf */ + int kt_index; /* current index in entries */ + int kt_rollover; + ktrace_entry_t *kt_entries; /* buffer of entries */ +} ktrace_t; + +/* + * Trace buffer snapshot structure. + */ +typedef struct ktrace_snap { + int ks_start; /* kt_index at time of snap */ + int ks_index; /* current index */ +} ktrace_snap_t; + +/* + * Exported interfaces. + */ +extern ktrace_t *ktrace_alloc(int, int); + +#if (defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING)) + +extern void ktrace_init(int zentries); +extern void ktrace_uninit(void); + +extern void ktrace_free(ktrace_t *); + +extern void ktrace_enter( + ktrace_t *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *); + +extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *); +extern int ktrace_nentries(ktrace_t *); +extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *); +extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *); + +#else + +#define ktrace_free(ktp) +#define ktrace_enter(ktp,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) + +#endif + +#endif /* __XFS_SUPPORT_KTRACE_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/move.c linux-2.4-xfs/fs/xfs/support/move.c --- linux-2.4.19/fs/xfs/support/move.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/move.c Thu Sep 5 15:35:08 2002 @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +#include +#include "debug.h" +#include "move.h" + +/* + * Move "n" bytes at byte address "cp"; "rw" indicates the direction + * of the move, and the I/O parameters are provided in "uio", which is + * update to reflect the data which was moved. Returns 0 on success or + * a non-zero errno on failure. 
+ */ +int +uiomove(void *cp, size_t n, enum uio_rw rw, struct uio *uio) +{ + register struct iovec *iov; + u_int cnt; + int error; + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = (u_int)iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = (u_int)n; + switch (uio->uio_segflg) { + case UIO_USERSPACE: + if (rw == UIO_READ) + error = copy_to_user(iov->iov_base, cp, cnt); + else + error = copy_from_user(cp, iov->iov_base, cnt); + if (error) + return EFAULT; + break; + + + case UIO_SYSSPACE: + if (rw == UIO_READ) + bcopy(cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, cp, cnt); + break; + + default: + ASSERT(0); + break; + } + iov->iov_base = (void *)((char *)iov->iov_base + cnt); + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp = (void *)((char *)cp + cnt); + n -= cnt; + } + return 0; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/move.h linux-2.4-xfs/fs/xfs/support/move.h --- linux-2.4.19/fs/xfs/support/move.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/move.h Thu Sep 5 15:35:08 2002 @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. 
Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef __XFS_SUPPORT_MOVE_H__ +#define __XFS_SUPPORT_MOVE_H__ + +#include +#include + +#define bzero(p,s) memset((p), 0, (s)) +#define bcopy(s,d,n) memcpy((d),(s),(n)) +#define bcmp(s1,s2,l) memcmp(s1,s2,l) +#define ovbcopy(from,to,count) memmove(to,from,count) + +typedef struct iovec iovec_t; + +typedef struct uio { + iovec_t *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs */ + int uio_fmode; /* file mode flags */ + xfs_off_t uio_offset; /* file offset */ + short uio_segflg; /* address space (kernel or user) */ + ssize_t uio_resid; /* residual count */ +} uio_t; + +/* + * I/O direction. + */ +typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t; + +/* + * Segment flag values. + */ +typedef enum uio_seg { + UIO_USERSPACE, /* uio_iov describes user space */ + UIO_SYSSPACE, /* uio_iov describes system space */ +} uio_seg_t; + + +extern int uiomove (void *, size_t, uio_rw_t, uio_t *); + +#endif /* __XFS_SUPPORT_MOVE_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/mrlock.c linux-2.4-xfs/fs/xfs/support/mrlock.c --- linux-2.4.19/fs/xfs/support/mrlock.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/mrlock.c Thu Aug 1 16:56:18 2002 @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include +#include +#include + +#include "mrlock.h" + + +#if USE_RW_WAIT_QUEUE_SPINLOCK +# define wq_write_lock write_lock +#else +# define wq_write_lock spin_lock +#endif + +/* + * We don't seem to need lock_type (only one supported), name, or + * sequence. But, XFS will pass it so let's leave them here for now. + */ +/* ARGSUSED */ +void +mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence) +{ + mrp->mr_count = 0; + mrp->mr_reads_waiting = 0; + mrp->mr_writes_waiting = 0; + init_waitqueue_head(&mrp->mr_readerq); + init_waitqueue_head(&mrp->mr_writerq); + mrp->mr_lock = SPIN_LOCK_UNLOCKED; +} + +/* + * Macros to lock/unlock the mrlock_t. 
+ */ + +#define MRLOCK(m) spin_lock(&(m)->mr_lock); +#define MRUNLOCK(m) spin_unlock(&(m)->mr_lock); + + +/* + * lock_wait should never be called in an interrupt thread. + * + * mrlocks can sleep (i.e. call schedule) and so they can't ever + * be called from an interrupt thread. + * + * threads that wake-up should also never be invoked from interrupt threads. + * + * But, waitqueue_lock is locked from interrupt threads - and we are + * called with interrupts disabled, so it is all OK. + */ + +/* ARGSUSED */ +void +lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw) +{ + DECLARE_WAITQUEUE( wait, current ); + + set_current_state(TASK_UNINTERRUPTIBLE); + + wq_write_lock(&q->lock); + if (rw) { + __add_wait_queue_tail(q, &wait); + } else { + __add_wait_queue(q, &wait); + } + + wq_write_unlock(&q->lock); + spin_unlock(lock); + + schedule(); + + set_current_state(TASK_RUNNING); + + wq_write_lock(&q->lock); + __remove_wait_queue(q, &wait); + wq_write_unlock(&q->lock); + + spin_lock(lock); + + /* return with lock held */ +} + +/* ARGSUSED */ +void +mrfree(mrlock_t *mrp) +{ +} + +/* ARGSUSED */ +void +mrlock(mrlock_t *mrp, int type, int flags) +{ + if (type == MR_ACCESS) + mraccess(mrp); + else + mrupdate(mrp); +} + +/* ARGSUSED */ +void +mraccessf(mrlock_t *mrp, int flags) +{ + MRLOCK(mrp); + if(mrp->mr_writes_waiting > 0) { + mrp->mr_reads_waiting++; + lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0); + mrp->mr_reads_waiting--; + } + while (mrp->mr_count < 0) { + mrp->mr_reads_waiting++; + lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0); + mrp->mr_reads_waiting--; + } + mrp->mr_count++; + MRUNLOCK(mrp); +} + +/* ARGSUSED */ +void +mrupdatef(mrlock_t *mrp, int flags) +{ + MRLOCK(mrp); + while(mrp->mr_count) { + mrp->mr_writes_waiting++; + lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1); + mrp->mr_writes_waiting--; + } + + mrp->mr_count = -1; /* writer on it */ + MRUNLOCK(mrp); +} + +int +mrtryaccess(mrlock_t *mrp) +{ + MRLOCK(mrp); + /* + * If anyone is waiting for update 
access or the lock is held for update + * fail the request. + */ + if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) { + MRUNLOCK(mrp); + return 0; + } + mrp->mr_count++; + MRUNLOCK(mrp); + return 1; +} + +int +mrtrypromote(mrlock_t *mrp) +{ + MRLOCK(mrp); + + if(mrp->mr_count == 1) { /* We are the only thread with the lock */ + mrp->mr_count = -1; /* writer on it */ + MRUNLOCK(mrp); + return 1; + } + + MRUNLOCK(mrp); + return 0; +} + +int +mrtryupdate(mrlock_t *mrp) +{ + MRLOCK(mrp); + + if(mrp->mr_count) { + MRUNLOCK(mrp); + return 0; + } + + mrp->mr_count = -1; /* writer on it */ + MRUNLOCK(mrp); + return 1; +} + +static __inline__ void mrwake(mrlock_t *mrp) +{ + /* + * First, if the count is now 0, we need to wake-up anyone waiting. + */ + if (!mrp->mr_count) { + if (mrp->mr_writes_waiting) { /* Wake-up first writer waiting */ + wake_up(&mrp->mr_writerq); + } else if (mrp->mr_reads_waiting) { /* Wakeup any readers waiting */ + wake_up(&mrp->mr_readerq); + } + } +} + +void +mraccunlock(mrlock_t *mrp) +{ + MRLOCK(mrp); + mrp->mr_count--; + mrwake(mrp); + MRUNLOCK(mrp); +} + +void +mrunlock(mrlock_t *mrp) +{ + MRLOCK(mrp); + if (mrp->mr_count < 0) { + mrp->mr_count = 0; + } else { + mrp->mr_count--; + } + mrwake(mrp); + MRUNLOCK(mrp); +} + +int +ismrlocked(mrlock_t *mrp, int type) /* No need to lock since info can change */ +{ + if (type == MR_ACCESS) + return (mrp->mr_count > 0); /* Read lock */ + else if (type == MR_UPDATE) + return (mrp->mr_count < 0); /* Write lock */ + else if (type == (MR_UPDATE | MR_ACCESS)) + return (mrp->mr_count); /* Any type of lock held */ + else /* Any waiters */ + return (mrp->mr_reads_waiting | mrp->mr_writes_waiting); +} + +/* + * Demote from update to access. We better be the only thread with the + * lock in update mode so it should be easy to set to 1. + * Wake-up any readers waiting. 
+ */ + +void +mrdemote(mrlock_t *mrp) +{ + MRLOCK(mrp); + mrp->mr_count = 1; + if (mrp->mr_reads_waiting) { /* Wakeup all readers waiting */ + wake_up(&mrp->mr_readerq); + } + MRUNLOCK(mrp); +} + +int +mrislocked_access(mrlock_t *mrp) +{ + return(mrp->mr_count > 0); +} + +int +mrislocked_update(mrlock_t *mrp) +{ + return(mrp->mr_count < 0); +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/mrlock.h linux-2.4-xfs/fs/xfs/support/mrlock.h --- linux-2.4.19/fs/xfs/support/mrlock.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/mrlock.h Mon Sep 2 17:49:27 2002 @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include +#include +#include +#include +#include + +/* + * Implement mrlocks on Linux that work for XFS. + * + * These are sleep locks and not spinlocks. If one wants read/write spinlocks, + * use read_lock, write_lock, ... see spinlock.h. + */ + +typedef struct mrlock_s { + int mr_count; + unsigned short mr_reads_waiting; + unsigned short mr_writes_waiting; + wait_queue_head_t mr_readerq; + wait_queue_head_t mr_writerq; + spinlock_t mr_lock; +} mrlock_t; + +#define MR_ACCESS 1 +#define MR_UPDATE 2 + +#define MRLOCK_BARRIER 0x1 +#define MRLOCK_ALLOW_EQUAL_PRI 0x8 + +/* + * mraccessf/mrupdatef take flags to be passed in while sleeping; + * only PLTWAIT is currently supported. 
+ */ + +extern void mraccessf(mrlock_t *, int); +extern void mrupdatef(mrlock_t *, int); +extern void mrlock(mrlock_t *, int, int); +extern void mrunlock(mrlock_t *); +extern void mraccunlock(mrlock_t *); +extern int mrtryupdate(mrlock_t *); +extern int mrtryaccess(mrlock_t *); +extern int mrtrypromote(mrlock_t *); +extern void mrdemote(mrlock_t *); + +extern int ismrlocked(mrlock_t *, int); +extern void mrlock_init(mrlock_t *, int type, char *name, long sequence); +extern void mrfree(mrlock_t *); + +#define mrinit(mrp, name) mrlock_init(mrp, MRLOCK_BARRIER, name, -1) +#define mraccess(mrp) mraccessf(mrp, 0) /* grab for READ/ACCESS */ +#define mrupdate(mrp) mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */ + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/mutex.h linux-2.4-xfs/fs/xfs/support/mutex.h --- linux-2.4.19/fs/xfs/support/mutex.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/mutex.h Sat Jul 27 13:08:38 2002 @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * Portions Copyright (c) 2002 Christoph Hellwig. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
/*
 * Map the mutexes from IRIX to Linux semaphores.
 *
 * Every mutex is a struct semaphore initialised with a count of 1.  The
 * IRIX-only arguments ("type", "name", "sequence", "num") are accepted so
 * that callers keep compiling, and are otherwise ignored.
 *
 * Destroy simply re-initializes the count to -99, which should block all
 * other callers.
 */
#define MUTEX_DEFAULT		0x0
typedef struct semaphore	mutex_t;

/* Initialise "lock" as an unlocked mutex (semaphore count of 1). */
#define mutex_init(lock, type, name)		sema_init(lock, 1)

/*
 * Same as mutex_init(), with the extra IRIX "sequence" argument.  The
 * expansion must name this macro's own parameter ("ptr"); referring to
 * "lock" here would silently pick up whatever variable of that name the
 * caller has in scope, or fail to compile.
 */
#define init_mutex(ptr, type, name, sequence)	sema_init(ptr, 1)

/* Poison the semaphore so later lockers block (count set to -99). */
#define mutex_destroy(lock)			sema_init(lock, -99)

/* Acquire the mutex, sleeping until it is available; "num" is ignored. */
#define mutex_lock(lock, num)			down(lock)

/*
 * Try to acquire the mutex without sleeping.  Evaluates to 1 when
 * down_trylock() returned 0 and to 0 otherwise.
 */
#define mutex_trylock(lock)			(down_trylock(lock) ? 0 : 1)

/* Release the mutex. */
#define mutex_unlock(lock)			up(lock)
/* If you consider tuning this algorithm, you should consult first:
   Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
   Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993.  */

/* Byte-wise swap two items of size SIZE.  SIZE must be non-zero: the
   do/while form always moves at least one byte.  */
#define SWAP(a, b, size)						      \
  do									      \
    {									      \
      register size_t __size = (size);					      \
      register char *__a = (a), *__b = (b);				      \
      do								      \
	{								      \
	  char __tmp = *__a;						      \
	  *__a++ = *__b;						      \
	  *__b++ = __tmp;						      \
	} while (--__size > 0);						      \
    } while (0)

/* Discontinue quicksort algorithm when partition gets below this size.
   This particular magic number was chosen to work best on a Sun 4/260. */
#define MAX_THRESH 4

/* Stack node declarations used to store unfulfilled partition obligations. */
typedef struct
  {
    char *lo;
    char *hi;
  } stack_node;

/* The next 4 #defines implement a very fast in-line stack abstraction. */
/* The stack needs log (total_elements) entries (we could even subtract
   log(MAX_THRESH)).  Since total_elements has type size_t, we get as
   upper bound for log (total_elements):
   bits per byte (CHAR_BIT) * sizeof(size_t).  */
#define STACK_SIZE	(8 * sizeof(unsigned long int))
#define PUSH(low, high)	((void) ((top->lo = (low)), (top->hi = (high)), ++top))
#define	POP(low, high)	((void) (--top, (low = top->lo), (high = top->hi)))
#define	STACK_NOT_EMPTY	(stack < top)


/* Sort the TOTAL_ELEMS items of SIZE bytes each that start at PBASE into
   the ascending order defined by CMP, in place.  CMP receives pointers to
   two items and returns a value less than, equal to, or greater than zero.

   Order size using quicksort.  This implementation incorporates
   four optimizations discussed in Sedgewick:

   1. Non-recursive, using an explicit stack of pointers that store the
      next array partition to sort.  The stack lives in a local array of
      STACK_SIZE nodes.

   2. Choose the pivot element using a median-of-three decision tree.
      This reduces the probability of selecting a bad pivot value and
      eliminates certain extraneous comparisons.

   3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
      insertion sort to order the MAX_THRESH items within each partition.

   4. The larger of the two sub-partitions is always pushed onto the
      stack first, with the algorithm then concentrating on the
      smaller partition.

   A call with TOTAL_ELEMS == 0 or SIZE == 0 returns without touching
   the array.  */

void
qsort (void *const pbase, size_t total_elems, size_t size,
       int (*cmp)(const void *, const void *))
{
  register char *base_ptr = (char *) pbase;

  const size_t max_thresh = MAX_THRESH * size;

  if (total_elems == 0 || size == 0)
    /* Nothing to order.  Zero-sized elements would also make every
       pointer step below a no-op (endless loop) and divide by zero in
       the median computation.  */
    return;

  if (total_elems > MAX_THRESH)
    {
      char *lo = base_ptr;
      char *hi = &lo[size * (total_elems - 1)];
      stack_node stack[STACK_SIZE];
      stack_node *top = stack;

      /* Sentinel entry: the final POP loads it into LO/HI, so it must be
	 initialised even though the values are never used.  */
      PUSH (NULL, NULL);

      while (STACK_NOT_EMPTY)
	{
	  char *left_ptr;
	  char *right_ptr;

	  /* Select median value from among LO, MID, and HI.  Rearrange
	     LO and HI so the three values are sorted.  This lowers the
	     probability of picking a pathological pivot value and
	     skips a comparison for both the LEFT_PTR and RIGHT_PTR in
	     the while loops.  */

	  char *mid = lo + size * ((hi - lo) / size >> 1);

	  if ((*cmp) ((void *) mid, (void *) lo) < 0)
	    SWAP (mid, lo, size);
	  if ((*cmp) ((void *) hi, (void *) mid) < 0)
	    {
	      SWAP (mid, hi, size);
	      /* The old HI is now in MID; it may still precede LO.  */
	      if ((*cmp) ((void *) mid, (void *) lo) < 0)
		SWAP (mid, lo, size);
	    }

	  left_ptr  = lo + size;
	  right_ptr = hi - size;

	  /* Here's the famous ``collapse the walls'' section of quicksort.
	     Gotta like those tight inner loops!  They are the main reason
	     that this algorithm runs much faster than others.  */
	  do
	    {
	      while ((*cmp) ((void *) left_ptr, (void *) mid) < 0)
		left_ptr += size;

	      while ((*cmp) ((void *) mid, (void *) right_ptr) < 0)
		right_ptr -= size;

	      if (left_ptr < right_ptr)
		{
		  SWAP (left_ptr, right_ptr, size);
		  /* Keep MID pointing at the pivot if the swap moved it.  */
		  if (mid == left_ptr)
		    mid = right_ptr;
		  else if (mid == right_ptr)
		    mid = left_ptr;
		  left_ptr += size;
		  right_ptr -= size;
		}
	      else if (left_ptr == right_ptr)
		{
		  left_ptr += size;
		  right_ptr -= size;
		  break;
		}
	    }
	  while (left_ptr <= right_ptr);

	  /* Set up pointers for next iteration.  First determine whether
	     left and right partitions are below the threshold size.  If so,
	     ignore one or both.  Otherwise, push the larger partition's
	     bounds on the stack and continue sorting the smaller one.  */

	  if ((size_t) (right_ptr - lo) <= max_thresh)
	    {
	      if ((size_t) (hi - left_ptr) <= max_thresh)
		/* Ignore both small partitions. */
		POP (lo, hi);
	      else
		/* Ignore small left partition. */
		lo = left_ptr;
	    }
	  else if ((size_t) (hi - left_ptr) <= max_thresh)
	    /* Ignore small right partition. */
	    hi = right_ptr;
	  else if ((right_ptr - lo) > (hi - left_ptr))
	    {
	      /* Push larger left partition indices. */
	      PUSH (lo, right_ptr);
	      lo = left_ptr;
	    }
	  else
	    {
	      /* Push larger right partition indices. */
	      PUSH (left_ptr, hi);
	      hi = right_ptr;
	    }
	}
    }

  /* Once the BASE_PTR array is partially sorted by quicksort the rest
     is completely sorted using insertion sort, since this is efficient
     for partitions below MAX_THRESH size.  BASE_PTR points to the beginning
     of the array to sort, and END_PTR points at the very last element in
     the array (*not* one beyond it!).  */
  {
    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
    char *tmp_ptr = base_ptr;
    char *thresh = end_ptr < base_ptr + max_thresh
		   ? end_ptr : base_ptr + max_thresh;
    register char *run_ptr;

    /* Find smallest element in first threshold and place it at the
       array's beginning.  This is the smallest array element,
       and the operation speeds up insertion sort's inner loop.  */

    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
      if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr) < 0)
	tmp_ptr = run_ptr;

    if (tmp_ptr != base_ptr)
      SWAP (tmp_ptr, base_ptr, size);

    /* Insertion sort, running from left-hand-side up to right-hand-side.  */

    run_ptr = base_ptr + size;
    while ((run_ptr += size) <= end_ptr)
      {
	/* Walk left to the insertion point; the minimum placed at
	   BASE_PTR above stops the scan.  */
	tmp_ptr = run_ptr - size;
	while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr) < 0)
	  tmp_ptr -= size;

	tmp_ptr += size;
	if (tmp_ptr != run_ptr)
	  {
	    char *trav;

	    /* Rotate the item into place one byte column at a time.  */
	    trav = run_ptr + size;
	    while (--trav >= run_ptr)
	      {
		char c = *trav;
		char *hi, *lo;

		for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
		  *hi = *lo;
		*hi = c;
	      }
	  }
      }
  }
}
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef QSORT_H +#define QSORT_H + +extern void qsort (void *const pbase, + size_t total_elems, + size_t size, + int (*cmp)(const void *, const void *)); + +#endif diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/rwsem.h linux-2.4-xfs/fs/xfs/support/rwsem.h --- linux-2.4.19/fs/xfs/support/rwsem.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/rwsem.h Wed Aug 21 15:56:25 2002 @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
#define MR_ACCESS	1
#define MR_UPDATE	2

/*
 * The IRIX mrlocks have been totally replaced by rw_semaphores on Linux.
 *
 * But the Linux rw_semaphores don't provide any asserts on the lock
 * holders.  XFS, on the other hand, makes excessive use of those in debug
 * builds.
 *
 * To keep this infrastructure on Linux we implement open-coded versions
 * of the lock asserts for the two most common rw_semaphore implementations.
 *
 * There is NO support for debug builds on architectures using the other
 * implementations.
 */

#ifdef DEBUG
#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
/*
 * rwsem_held - query the state of a generic-spinlock rw_semaphore.
 *
 * "type" selects the question being asked:
 *   MR_ACCESS            - non-zero when sem->activity is positive
 *   MR_UPDATE            - non-zero when sem->activity is negative
 *   MR_UPDATE|MR_ACCESS  - non-zero when sem->activity is not zero
 *   anything else        - non-zero when sem->wait_list is not empty
 */
static inline int rwsem_held(struct rw_semaphore *sem, int type)
{
	if (type == MR_ACCESS)			/* Read lock */
		return (sem->activity > 0);
	if (type == MR_UPDATE)			/* Write lock */
		return (sem->activity < 0);
	if (type == (MR_UPDATE|MR_ACCESS))	/* Any type of lock held */
		return (sem->activity != 0);

	/* Any waiters */
	return (!list_empty(&sem->wait_list));
}
#elif defined(__i386__)
/*
 * rwsem_held - query the state of an i386 rw_semaphore.
 *
 * Same contract as the generic-spinlock variant, but the lock state is
 * read from sem->count instead of sem->activity.
 */
static inline int rwsem_held(struct rw_semaphore *sem, int type)
{
	if (type == MR_ACCESS)			/* Read lock */
		return (sem->count > 0);
	if (type == MR_UPDATE)			/* Write lock */
		return (sem->count < 0);
	if (type == (MR_UPDATE|MR_ACCESS))	/* Any type of lock held */
		return (sem->count != 0);

	/* Any waiters */
	return (!list_empty(&sem->wait_list));
}
#else
#error "Sorry, XFS debug builds are not supported in this configuration"
#endif
#endif /* DEBUG */
Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SEMA_H__ +#define __XFS_SUPPORT_SEMA_H__ + +#include +#include +#include +#include +#include + +/* + * sema_t structure just maps to struct semaphore in Linux kernel. + */ + +typedef struct semaphore sema_t; + +#define init_sema(sp, val, c, d) sema_init(sp, val) +#define initsema(sp, val) sema_init(sp, val) +#define initnsema(sp, val, name) sema_init(sp, val) +#define psema(sp, b) down(sp) +#define vsema(sp) up(sp) +#define valusema(sp) (atomic_read(&(sp)->count)) +#define freesema(sema) + +/* + * Map cpsema (try to get the sema) to down_trylock. We need to switch + * the return values since cpsema returns 1 (acquired) 0 (failed) and + * down_trylock returns the reverse 0 (acquired) 1 (failed). + */ + +#define cpsema(sp) (down_trylock(sp) ? 0 : 1) + +/* + * Didn't do cvsema(sp). Not sure how to map this to up/down/... + * It does a vsema if the value is < 0, otherwise nothing. + */ + +#endif /* __XFS_SUPPORT_SEMA_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/spin.h linux-2.4-xfs/fs/xfs/support/spin.h --- linux-2.4.19/fs/xfs/support/spin.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/spin.h Wed Sep 4 22:35:42 2002 @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * Portions Copyright (c) 2002 Christoph Hellwig. All Rights Reserved.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SPIN_H__ +#define __XFS_SUPPORT_SPIN_H__ + +#include /* preempt needs this */ +#include + +/* + * Map lock_t from IRIX to Linux spinlocks. + * + * Note that linux turns on/off spinlocks depending on CONFIG_SMP. + * We don't need to worry about SMP or not here. 
+ */ + +typedef spinlock_t lock_t; + +#define spinlock_init(lock, name) spin_lock_init(lock) +#define init_spinlock(lock, name, ll) spin_lock_init(lock) +#define spinlock_destroy(lock) + +static inline unsigned long mutex_spinlock(lock_t *lock) +{ + spin_lock(lock); + return 0; +} + +/*ARGSUSED*/ +static inline void mutex_spinunlock(lock_t *lock, unsigned long s) +{ + spin_unlock(lock); +} + +static inline void nested_spinlock(lock_t *lock) +{ + spin_lock(lock); +} + +static inline void nested_spinunlock(lock_t *lock) +{ + spin_unlock(lock); +} + +#endif /* __XFS_SUPPORT_SPIN_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/sv.h linux-2.4-xfs/fs/xfs/support/sv.h --- linux-2.4.19/fs/xfs/support/sv.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/sv.h Sat Jul 27 13:08:38 2002 @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * Portions Copyright (c) 2002 Christoph Hellwig. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SV_H__ +#define __XFS_SUPPORT_SV_H__ + +#include +#include +#include + +/* + * Synchronisation variables. + * + * (Parameters "pri", "svf" and "rts" are not implemented) + */ + +typedef struct sv_s { + wait_queue_head_t waiters; +} sv_t; + +#define SV_FIFO 0x0 /* sv_t is FIFO type */ +#define SV_LIFO 0x2 /* sv_t is LIFO type */ +#define SV_PRIO 0x4 /* sv_t is PRIO type */ +#define SV_KEYED 0x6 /* sv_t is KEYED type */ +#define SV_DEFAULT SV_FIFO + + +static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, + unsigned long timeout) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&sv->waiters, &wait); + set_current_state(state); + spin_unlock(lock); + + schedule_timeout(timeout); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&sv->waiters, &wait); +} + +#define init_sv(sv,type,name,flag) \ + init_waitqueue_head(&(sv)->waiters) +#define sv_init(sv,flag,name) \ + init_waitqueue_head(&(sv)->waiters) +#define sv_destroy(sv) \ + /*NOTHING*/ +#define sv_wait(sv, pri, lock, s) \ + _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_wait_sig(sv, pri, lock, s) \ + _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \ + _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \ + _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_signal(sv) \ + wake_up(&(sv)->waiters) +#define sv_broadcast(sv) \ + wake_up_all(&(sv)->waiters) + +#endif /* __XFS_SUPPORT_SV_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/time.h linux-2.4-xfs/fs/xfs/support/time.h --- 
linux-2.4.19/fs/xfs/support/time.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/time.h Wed Sep 4 22:35:41 2002 @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_TIME_H__ +#define __XFS_SUPPORT_TIME_H__ + +#include + +static inline void delay(long ticks) +{ + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(ticks); +} + +static inline void nanotime(struct timespec *tvp) +{ + tvp->tv_sec = xtime.tv_sec; + tvp->tv_nsec = xtime.tv_usec * 1000; +} + +#endif /* __XFS_SUPPORT_TIME_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/uuid.c linux-2.4-xfs/fs/xfs/support/uuid.c --- linux-2.4.19/fs/xfs/support/uuid.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/uuid.c Wed Jul 31 22:23:56 2002 @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include + +#ifdef __sparc__ +#include +#else +#include +#endif + +#include +#include +#include "time.h" +#include "move.h" +#include "uuid.h" + +#ifndef CONFIG_NET +#define dev_get_by_name(x) (NULL) +#define dev_put(x) do { } while (0) +#endif + +/* NODE_SIZE is the number of bytes used for the node identifier portion. */ +#define NODE_SIZE 6 + +/* + * Total size must be 128 bits. N.B. definition of uuid_t in uuid.h! + */ +typedef struct { + u_int32_t uu_timelow; /* time "low" */ + u_int16_t uu_timemid; /* time "mid" */ + u_int16_t uu_timehi; /* time "hi" and version */ + u_int16_t uu_clockseq; /* "reserved" and clock sequence */ + u_int16_t uu_node[NODE_SIZE / 2]; /* ethernet hardware address */ +} uu_t; + +/* + * The Time Base Correction is the amount to add on to a UNIX-based + * time value (i.e. seconds since 1 Jan. 1970) to convert it to the + * time base for UUIDs (15 Oct. 1582). + */ +#define UUID_TBC 0x01B21DD2138140LL + +static short uuid_eaddr[NODE_SIZE / 2]; /* ethernet address */ +static __int64_t uuid_time; /* last time basis used */ +static u_int16_t uuid_clockseq; /* boot-time randomizer */ +DECLARE_MUTEX(uuid_lock); + +/* + * uuid_init - called from out of init_tbl[] + */ +void +uuid_init(void) +{ +} + +/* + * uuid_getnodeuniq - obtain the node unique fields of a UUID. + * + * This is not in any way a standard or condoned UUID function; + * it just something that's needed for user-level file handles. 
+ */ +void +uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) +{ + char *uu=(char*)uuid; + + /* on IRIX, this function assumes big-endian fields within + * the uuid, so we use INT_GET to get the same result on + * little-endian systems + */ + + fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) + + INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT); + fsid[1] = INT_GET(*(u_int32_t*)(uu ), ARCH_CONVERT); +} + +void +uuid_create_nil(uuid_t *uuid) +{ + bzero(uuid, sizeof *uuid); +} + +int +uuid_is_nil(uuid_t *uuid) +{ + int i; + char *cp = (char *)uuid; + + if (uuid == NULL) + return B_TRUE; + /* implied check of version number here... */ + for (i = 0; i < sizeof *uuid; i++) + if (*cp++) return B_FALSE; /* not nil */ + return B_TRUE; /* is nil */ +} + +int +uuid_equal(uuid_t *uuid1, uuid_t *uuid2) +{ + return bcmp(uuid1, uuid2, sizeof(uuid_t)) ? B_FALSE : B_TRUE; +} + +/* + * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom + * 64-bit words. NOTE: This function can not be changed EVER. Although + * brain-dead, some applications depend on this 64-bit value remaining + * persistent. Specifically, DMI vendors store the value as a persistent + * filehandle. + */ +__uint64_t +uuid_hash64(uuid_t *uuid) +{ + __uint64_t *sp = (__uint64_t *)uuid; + + return sp[0] + sp[1]; +} /* uuid_hash64 */ + +static void +get_eaddr(char *junk) +{ +#ifdef __sparc__ + memcpy(uuid_eaddr, idprom->id_ethaddr, 6); +#else + struct net_device *dev; + + dev = dev_get_by_name("eth0"); + if (!dev || !dev->addr_len) { + get_random_bytes(uuid_eaddr, sizeof(uuid_eaddr)); + } else { + memcpy(uuid_eaddr, dev->dev_addr, + dev->addr_lenaddr_len:sizeof(uuid_eaddr)); + dev_put(dev); + } +#endif +} + +/* + * uuid_create - kernel version, does the actual work + */ +void +uuid_create(uuid_t *uuid) +{ + int i; + uu_t *uu = (uu_t *)uuid; + static int uuid_have_eaddr = 0; /* ethernet addr inited? */ + static int uuid_is_init = 0; /* time/clockseq inited? 
*/ + + down(&uuid_lock); + if (!uuid_is_init) { + timespec_t ts; + + nanotime(&ts); + /* + * The clock sequence must be initialized randomly. + */ + uuid_clockseq = ((unsigned long)jiffies & 0xfff) | 0x8000; + /* + * Initialize the uuid time, it's in 100 nanosecond + * units since a time base in 1582. + */ + uuid_time = ts.tv_sec * 10000000LL + + ts.tv_nsec / 100LL + + UUID_TBC; + uuid_is_init = 1; + } + if (!uuid_have_eaddr) { + uuid_have_eaddr = 1; + get_eaddr((char *)uuid_eaddr); + } + uuid_time++; + uu->uu_timelow = (u_int32_t)(uuid_time & 0x00000000ffffffffLL); + uu->uu_timemid = (u_int16_t)((uuid_time >> 32) & 0x0000ffff); + uu->uu_timehi = (u_int16_t)((uuid_time >> 48) & 0x00000fff) | 0x1000; + up(&uuid_lock); + uu->uu_clockseq = uuid_clockseq; + for (i = 0; i < (NODE_SIZE / 2); i++) + uu->uu_node [i] = uuid_eaddr [i]; +} + +int +uuid_compare(uuid_t *uuid1, uuid_t *uuid2) +{ + int i; + char *cp1 = (char *) uuid1; + char *cp2 = (char *) uuid2; + + if (uuid1 == NULL) { + if (uuid2 == NULL) { + return 0; /* equal because both are nil */ + } else { + return -1; /* uuid1 nil, so precedes uuid2 */ + } + } else if (uuid2 == NULL) { + return 1; + } + + /* implied check of version number here... */ + for (i = 0; i < sizeof(uuid_t); i++) { + if (*cp1 < *cp2) + return -1; + if (*cp1++ > *cp2++) + return 1; + } + return 0; /* they're equal */ +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/support/uuid.h linux-2.4-xfs/fs/xfs/support/uuid.h --- linux-2.4.19/fs/xfs/support/uuid.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/support/uuid.h Sat Jul 27 13:08:38 2002 @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_UUID_H__ +#define __XFS_SUPPORT_UUID_H__ + +void uuid_create_nil(uuid_t *uuid); +int uuid_is_nil(uuid_t *uuid); +int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); +void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +__uint64_t uuid_hash64(uuid_t *uuid); + +#endif /* __XFS_SUPPORT_UUID_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs.h linux-2.4-xfs/fs/xfs/xfs.h --- linux-2.4.19/fs/xfs/xfs.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs.h Wed Sep 4 23:34:52 2002 @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_H__ +#define __XFS_H__ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#endif /* __XFS_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_acl.c linux-2.4-xfs/fs/xfs/xfs_acl.c --- 
linux-2.4.19/fs/xfs/xfs_acl.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_acl.c Thu Aug 1 01:28:15 2002 @@ -0,0 +1,893 @@ +/* + * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include /* NOTE(review): header name is missing in this copy of the patch (angle-bracket text looks stripped) -- restore from the original */
+#include
+/* Forward declarations of the file-local helpers defined further down. */
+STATIC int	xfs_acl_setmode(vnode_t *, xfs_acl_t *);
+STATIC void	xfs_acl_filter_mode(mode_t, xfs_acl_t *);
+STATIC void	xfs_acl_get_endian(xfs_acl_t *);
+STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
+STATIC int	xfs_acl_invalid(xfs_acl_t *);
+STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
+STATIC void	xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
+STATIC void	xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
+STATIC int	xfs_acl_allow_set(vnode_t *, int);
+
+kmem_zone_t *xfs_acl_zone;	/* zone used by the _ACL_ALLOC/_ACL_FREE macros in xfs_acl.h */
+
+
+/*
+ * Test for existence of access ACL attribute as efficiently as possible: a NULL buffer is passed with ATTR_KERNOVAL; returns 1 if the lookup reported no error, else 0.
+ */
+int
+xfs_acl_vhasacl_access(
+	vnode_t		*vp)
+{
+	int		error;
+
+	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
+	return (error == 0);
+}
+
+/*
+ * Test for existence of default ACL attribute as efficiently as possible; always 0 for anything that is not a directory (VDIR).
+ */
+int
+xfs_acl_vhasacl_default(
+	vnode_t		*vp)
+{
+	int		error;
+
+	if (vp->v_type != VDIR)
+		return 0;
+	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
+	return (error == 0);
+}
+
+/*
+ * Convert from extended attribute representation to in-memory for XFS.
+ */
+STATIC int
+posix_acl_xattr_to_xfs(
+	posix_acl_xattr_header	*src,
+	size_t			size,
+	xfs_acl_t		*dest)
+{
+	posix_acl_xattr_entry	*src_entry;
+	xfs_acl_entry_t		*dest_entry;
+	int			n;
+	/* Returns 0 on success or a positive EINVAL; dest is zeroed before any entry is copied. */
+	if (!src || !dest)
+		return EINVAL;
+
+	if (size < sizeof(posix_acl_xattr_header))
+		return EINVAL;
+
+	if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+		return EINVAL;
+
+	memset(dest, 0, sizeof(xfs_acl_t));
+	dest->acl_cnt = posix_acl_xattr_count(size);	/* entry count is derived from the byte size alone */
+	if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
+		return EINVAL;
+
+	/*
+	 * acl_set_file(3) may request that we set default ACLs with
+	 * zero length -- defend (gracefully) against that here.
+	 */
+	if (!dest->acl_cnt)
+		return 0;
+
+	src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));	/* entries follow the header */
+	dest_entry = &dest->acl_entry[0];
+
+	for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
+		dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
+		if (_ACL_PERM_INVALID(dest_entry->ae_perm))
+			return EINVAL;
+		dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag);
+		switch(dest_entry->ae_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			dest_entry->ae_id = le32_to_cpu(src_entry->e_id);	/* only named entries carry an id */
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			dest_entry->ae_id = ACL_UNDEFINED_ID;
+			break;
+		default:
+			return EINVAL;
+		}
+	}
+	if (xfs_acl_invalid(dest))	/* structural check: required entries, duplicates, mask */
+		return EINVAL;
+
+	return 0;
+}
+
+/*
+ * Comparison function called from qsort().
+ * Primary key is ae_tag, secondary key is ae_id.
+ */
+STATIC int
+xfs_acl_entry_compare(
+	const void	*va,
+	const void	*vb)
+{
+	xfs_acl_entry_t	*a = (xfs_acl_entry_t *)va,
+			*b = (xfs_acl_entry_t *)vb;
+	/* NOTE(review): the result is a plain signed difference -- confirm tags/ids can never be far enough apart to overflow int. */
+	if (a->ae_tag == b->ae_tag)
+		return (a->ae_id - b->ae_id);
+	return (a->ae_tag - b->ae_tag);
+}
+
+/*
+ * Convert from in-memory XFS to extended attribute representation.
+ */ +STATIC int +posix_acl_xfs_to_xattr( + xfs_acl_t *src, + posix_acl_xattr_header *dest, + size_t size) +{ + int n; + size_t new_size = posix_acl_xattr_size(src->acl_cnt); + posix_acl_xattr_entry *dest_entry; + xfs_acl_entry_t *src_entry; + + if (size < new_size) + return -ERANGE; + + /* Need to sort src XFS ACL by */ + qsort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]), + xfs_acl_entry_compare); + + dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + dest_entry = &dest->a_entries[0]; + src_entry = &src->acl_entry[0]; + for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) { + dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm); + if (_ACL_PERM_INVALID(src_entry->ae_perm)) + return -EINVAL; + dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag); + switch (src_entry->ae_tag) { + case ACL_USER: + case ACL_GROUP: + dest_entry->e_id = cpu_to_le32(src_entry->ae_id); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + default: + return -EINVAL; + } + } + return new_size; +} + +int +xfs_acl_vget( + vnode_t *vp, + void *acl, + size_t size, + int kind) +{ + int error; + xfs_acl_t *xfs_acl; + posix_acl_xattr_header *ext_acl = acl; + + VN_HOLD(vp); + if ((error = _MAC_VACCESS(vp, NULL, VREAD))) + goto out; + if (!(_ACL_ALLOC(xfs_acl))) { + error = ENOMEM; + goto out; + } + + memset(xfs_acl, 0, sizeof(xfs_acl_t)); + xfs_acl_get_attr(vp, xfs_acl, kind, size? 
0 : ATTR_KERNOVAL, &error); + if (error) + goto out; + + if (!size) { + error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES); + } else { + if (xfs_acl_invalid(xfs_acl)) { + error = EINVAL; + goto out; + } + if (kind == _ACL_TYPE_ACCESS) { + vattr_t va; + + va.va_mask = AT_MODE; + VOP_GETATTR(vp, &va, 0, sys_cred, error); + if (error) + goto out; + xfs_acl_sync_mode(va.va_mode, xfs_acl); + } + error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); + } +out: + VN_RELE(vp); + _ACL_FREE(xfs_acl); + return -error; +} + +int +xfs_acl_vremove( + vnode_t *vp, + int kind) +{ + int error; + + VN_HOLD(vp); + error = xfs_acl_allow_set(vp, kind); + if (!error) { + VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT? + SGI_ACL_DEFAULT: SGI_ACL_FILE, + ATTR_ROOT, sys_cred, error); + if (error == ENOATTR) + error = 0; /* 'scool */ + } + VN_RELE(vp); + return -error; +} + +int +xfs_acl_vset( + vnode_t *vp, + void *acl, + size_t size, + int kind) +{ + posix_acl_xattr_header *ext_acl = acl; + xfs_acl_t *xfs_acl; + int error; + + if (!acl) + return -EINVAL; + + if (!(_ACL_ALLOC(xfs_acl))) + return -ENOMEM; + + error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl); + if (error) { + _ACL_FREE(xfs_acl); + return -error; + } + if (!xfs_acl->acl_cnt) { + _ACL_FREE(xfs_acl); + return 0; + } + + VN_HOLD(vp); + error = xfs_acl_allow_set(vp, kind); + if (error) + goto out; + + /* Incoming ACL exists, set file mode based on its value */ + if (kind == _ACL_TYPE_ACCESS) + xfs_acl_setmode(vp, xfs_acl); + xfs_acl_set_attr(vp, xfs_acl, kind, &error); +out: + VN_RELE(vp); + _ACL_FREE(xfs_acl); + return -error; +} + +int +xfs_acl_iaccess( + xfs_inode_t *ip, + mode_t mode, + cred_t *cr) +{ + xfs_acl_t *acl; + int error; + + if (!(_ACL_ALLOC(acl))) + return -1; + + /* If the file has no ACL return -1. */ + if (xfs_attr_fetch(ip, SGI_ACL_FILE, (char *)acl, sizeof(xfs_acl_t))) { + _ACL_FREE(acl); + return -1; + } + xfs_acl_get_endian(acl); + + /* If the file has an empty ACL return -1. 
*/ + if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) { + _ACL_FREE(acl); + return -1; + } + + /* Synchronize ACL with mode bits */ + xfs_acl_sync_mode(ip->i_d.di_mode, acl); + + error = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr); + _ACL_FREE(acl); + return error; +} + +STATIC int +xfs_acl_allow_set( + vnode_t *vp, + int kind) +{ + vattr_t va; + int error; + + if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR) + return ENOTDIR; + if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + return EROFS; + if ((error = _MAC_VACCESS(vp, NULL, VWRITE))) + return error; + va.va_mask = AT_UID; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return error; + if (va.va_uid != current->fsuid && !capable(CAP_FOWNER)) + return EPERM; + return error; +} + +/* + * The access control process to determine the access permission: + * if uid == file owner id, use the file owner bits. + * if gid == file owner group id, use the file group bits. + * scan ACL for a maching user or group, and use matched entry + * permission. Use total permissions of all matching group entries, + * until all acl entries are exhausted. The final permission produced + * by matching acl entry or entries needs to be & with group permission. + * if not owner, owning group, or matching entry in ACL, use file + * other bits. + */ +STATIC int +xfs_acl_capability_check( + mode_t mode, + cred_t *cr) +{ + if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH)) + return EACCES; + if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) + return EACCES; + if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) + return EACCES; + return 0; +} + +/* + * Note: cr is only used here for the capability check if the ACL test fails. + * It is not used to find out the credentials uid or groups etc, as was + * done in IRIX. It is assumed that the uid and groups for the current + * thread are taken from "current" instead of the cr parameter. 
+ */
+STATIC int
+xfs_acl_access(
+	uid_t		fuid,
+	gid_t		fgid,
+	xfs_acl_t	*fap,
+	mode_t		md,
+	cred_t		*cr)
+{
+	xfs_acl_entry_t	matched;
+	int		i, allows;
+	int		maskallows = -1;	/* true, but not 1, either */
+	int		seen_userobj = 0;
+	/* Returns 0 when the ACL grants md, otherwise whatever xfs_acl_capability_check() decides. */
+	matched.ae_tag = 0;	/* Invalid type */
+	md >>= 6;	/* Normalize the bits for comparison */
+
+	for (i = 0; i < fap->acl_cnt; i++) {
+		/*
+		 * Break out if we've got a user_obj entry or
+		 * a user entry and the mask (and have processed USER_OBJ)
+		 */
+		if (matched.ae_tag == ACL_USER_OBJ)
+			break;
+		if (matched.ae_tag == ACL_USER) {
+			if (maskallows != -1 && seen_userobj)
+				break;
+			if (fap->acl_entry[i].ae_tag != ACL_MASK &&
+			    fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
+				continue;
+		}
+		/* True if this entry allows the requested access */
+		allows = ((fap->acl_entry[i].ae_perm & md) == md);
+
+		switch (fap->acl_entry[i].ae_tag) {
+		case ACL_USER_OBJ:
+			seen_userobj = 1;
+			if (fuid != current->fsuid)
+				continue;
+			matched.ae_tag = ACL_USER_OBJ;
+			matched.ae_perm = allows;
+			break;
+		case ACL_USER:
+			if (fap->acl_entry[i].ae_id != current->fsuid)
+				continue;
+			matched.ae_tag = ACL_USER;
+			matched.ae_perm = allows;
+			break;
+		case ACL_GROUP_OBJ:
+			if ((matched.ae_tag == ACL_GROUP_OBJ ||
+			    matched.ae_tag == ACL_GROUP) && !allows)	/* keep an earlier group match rather than replace it with a denial */
+				continue;
+			if (!in_group_p(fgid))
+				continue;
+			matched.ae_tag = ACL_GROUP_OBJ;
+			matched.ae_perm = allows;
+			break;
+		case ACL_GROUP:
+			if ((matched.ae_tag == ACL_GROUP_OBJ ||
+			    matched.ae_tag == ACL_GROUP) && !allows)
+				continue;
+			if (!in_group_p(fap->acl_entry[i].ae_id))
+				continue;
+			matched.ae_tag = ACL_GROUP;
+			matched.ae_perm = allows;
+			break;
+		case ACL_MASK:
+			maskallows = allows;
+			break;
+		case ACL_OTHER:
+			if (matched.ae_tag != 0)	/* "other" applies only when nothing else matched */
+				continue;
+			matched.ae_tag = ACL_OTHER;
+			matched.ae_perm = allows;
+			break;
+		}
+	}
+	/*
+	 * First possibility is that no matched entry allows access.
+	 * The capability to override DAC may exist, so check for it.
+	 */
+	switch (matched.ae_tag) {
+	case ACL_OTHER:
+	case ACL_USER_OBJ:
+		if (matched.ae_perm)
+			return 0;
+		break;
+	case ACL_USER:
+	case ACL_GROUP_OBJ:
+	case ACL_GROUP:
+		if (maskallows && matched.ae_perm)	/* maskallows stays -1 (true) when the ACL has no mask entry */
+			return 0;
+		break;
+	case 0:
+		break;
+	}
+	return xfs_acl_capability_check(md, cr);	/* md was already shifted down by the >>= 6 above */
+}
+
+/*
+ * ACL validity checker.
+ * This ACL validation routine checks that each entry read in makes sense; returns 0 if valid, EINVAL if not.
+ */
+STATIC int
+xfs_acl_invalid(
+	xfs_acl_t	*aclp)
+{
+	xfs_acl_entry_t	*entry, *e;
+	int		user = 0, group = 0, other = 0, mask = 0;
+	int		mask_required = 0;
+	int		i, j;
+	/* A NULL or over-long ACL is rejected before any entry is inspected. */
+	if (!aclp)
+		goto acl_invalid;
+
+	if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
+		goto acl_invalid;
+
+	for (i = 0; i < aclp->acl_cnt; i++) {
+		entry = &aclp->acl_entry[i];
+		switch (entry->ae_tag) {
+		case ACL_USER_OBJ:
+			if (user++)
+				goto acl_invalid;
+			break;
+		case ACL_GROUP_OBJ:
+			if (group++)
+				goto acl_invalid;
+			break;
+		case ACL_OTHER:
+			if (other++)
+				goto acl_invalid;
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			for (j = i + 1; j < aclp->acl_cnt; j++) {	/* no two named entries may share tag and id */
+				e = &aclp->acl_entry[j];
+				if (e->ae_id == entry->ae_id &&
+				    e->ae_tag == entry->ae_tag)
+					goto acl_invalid;
+			}
+			mask_required++;
+			break;
+		case ACL_MASK:
+			if (mask++)
+				goto acl_invalid;
+			break;
+		default:
+			goto acl_invalid;
+		}
+	}
+	if (!user || !group || !other || (mask_required && !mask))	/* u::, g::, o:: mandatory; m:: only with named entries */
+		goto acl_invalid;
+	else
+		return 0;
+acl_invalid:
+	return EINVAL;
+}
+
+/*
+ * Do ACL endian conversion (in place: acl_cnt first, then that many entries).
+ */
+STATIC void
+xfs_acl_get_endian(
+	xfs_acl_t	*aclp)
+{
+	xfs_acl_entry_t	*ace, *end;
+	/* NOTE(review): acl_cnt is not range-checked before it bounds the loop below -- confirm callers cannot pass a corrupt count. */
+	INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
+	end = &aclp->acl_entry[0]+aclp->acl_cnt;
+	for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
+		INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
+		INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
+		INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+	}
+}
+
+/*
+ * Get the ACL from the EA and do endian conversion.
+ */
+STATIC void
+xfs_acl_get_attr(
+	vnode_t		*vp,
+	xfs_acl_t	*aclp,
+	int		kind,
+	int		flags,
+	int		*error)
+{
+	int		len = sizeof(xfs_acl_t);
+	/* ATTR_ROOT is always added to flags; *error is tested right after the VOP below. */
+	flags |= ATTR_ROOT;
+	VOP_ATTR_GET(vp,
+		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
+		(char *)aclp, &len, flags, sys_cred, *error);
+	if (*error || (flags & ATTR_KERNOVAL))	/* nothing to convert on failure or for an ATTR_KERNOVAL lookup */
+		return;
+	xfs_acl_get_endian(aclp);
+}
+
+/*
+ * Set the EA with the ACL and do endian conversion (into a scratch copy; aclp itself is not modified).
+ */
+STATIC void
+xfs_acl_set_attr(
+	vnode_t		*vp,
+	xfs_acl_t	*aclp,
+	int		kind,
+	int		*error)
+{
+	xfs_acl_entry_t	*ace, *newace, *end;
+	xfs_acl_t	*newacl;
+	int		len;
+
+	if (!(_ACL_ALLOC(newacl))) {
+		*error = ENOMEM;
+		return;
+	}
+	/* len covers the count field plus only the acl_cnt entries in use, not the whole fixed-size array. */
+	len = sizeof(xfs_acl_t) -
+	      (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
+	end = &aclp->acl_entry[0]+aclp->acl_cnt;
+	for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
+	     ace < end;
+	     ace++, newace++) {
+		INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
+		INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
+		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+	}	/* NOTE(review): any struct padding in newacl keeps its allocation-time contents -- confirm that is acceptable in the stored attribute */
+	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
+	VOP_ATTR_SET(vp,
+		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
+		(char *)newacl, len, ATTR_ROOT, sys_cred, *error);
+	_ACL_FREE(newacl);
+}
+/* Fill access_acl and/or default_acl (either may be NULL) from vp; an ACL that cannot be read gets acl_cnt = XFS_ACL_NOT_PRESENT. */
+int
+xfs_acl_vtoacl(
+	vnode_t		*vp,
+	xfs_acl_t	*access_acl,
+	xfs_acl_t	*default_acl)
+{
+	vattr_t		va;
+	int		error = 0;
+
+	if (access_acl) {
+		/*
+		 * Get the Access ACL and the mode. If either cannot
+		 * be obtained for some reason, invalidate the access ACL.
+		 */
+		xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
+		if (!error) {
+			/* Got the ACL, need the mode... */
+			va.va_mask = AT_MODE;
+			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+		}
+
+		if (error)
+			access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
+		else /* We have a good ACL and the file mode, synchronize. */
+			xfs_acl_sync_mode(va.va_mode, access_acl);
+	}
+
+	if (default_acl) {
+		xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
+		if (error)
+			default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
+	}
+	return error;	/* NOTE(review): appears to reflect only the last lookup performed -- confirm the VOP macro always assigns error */
+}
+
+/*
+ * This function takes the parent directory's default acl (pdaclp), processes it
+ * and lets the child inherit the acl(s) that it should. Returns 0 or a positive errno.
+ */
+int
+xfs_acl_inherit(
+	vnode_t		*vp,
+	vattr_t		*vap,
+	xfs_acl_t	*pdaclp)
+{
+	xfs_acl_t	*cacl;
+	int		error = 0;
+
+	/*
+	 * If the parent does not have a default ACL, or it's an
+	 * invalid ACL, we're done.
+	 */
+	if (!vp)
+		return 0;
+	if (!pdaclp || xfs_acl_invalid(pdaclp))
+		return 0;
+
+	/*
+	 * Copy the default ACL of the containing directory to
+	 * the access ACL of the new file and use the mode that
+	 * was passed in to set up the correct initial values for
+	 * the u::,g::[m::], and o:: entries. This is what makes
+	 * umask() "work" with ACL's.
+	 */
+
+	if (!(_ACL_ALLOC(cacl)))
+		return ENOMEM;
+
+	memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
+	xfs_acl_filter_mode(vap->va_mode, cacl);
+	xfs_acl_setmode(vp, cacl);	/* return value is not checked here */
+
+	/*
+	 * Set the Default and Access ACL on the file. The mode is already
+	 * set on the file, so we don't need to worry about that.
+	 *
+	 * If the new file is a directory, its default ACL is a copy of
+	 * the containing directory's default ACL.
+	 */
+	if (vp->v_type == VDIR)
+		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
+	if (!error)
+		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
+	_ACL_FREE(cacl);
+	return error;
+}
+
+/*
+ * Set up the correct mode on the file based on the supplied ACL. This
+ * makes sure that the mode on the file reflects the state of the
+ * u::,g::[m::], and o:: entries in the ACL. Since the mode is where
+ * the ACL is going to get the permissions for these entries, we must
+ * synchronize the mode whenever we set the ACL on a file.
+ */
+STATIC int
+xfs_acl_setmode(
+	vnode_t		*vp,
+	xfs_acl_t	*acl)
+{
+	vattr_t		va;
+	xfs_acl_entry_t	*ap;
+	xfs_acl_entry_t	*gap = NULL;
+	int		i, error, nomask = 1;
+	/* Returns 0 (also when there is no ACL to apply) or the error reported by the GETATTR/SETATTR VOP. */
+	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
+		return 0;
+
+	/*
+	 * Copy the u::, g::, o::, and m:: bits from the ACL into the
+	 * mode. The m:: bits take precedence over the g:: bits.
+	 */
+	va.va_mask = AT_MODE;
+	VOP_GETATTR(vp, &va, 0, sys_cred, error);
+	if (error)
+		return error;
+
+	va.va_mask = AT_MODE;	/* same mask again, this time for the VOP_SETATTR below */
+	va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);	/* clear the rwx bits, keep every other mode bit */
+	ap = acl->acl_entry;
+	for (i = 0; i < acl->acl_cnt; ++i) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			va.va_mode |= ap->ae_perm << 6;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;	/* remembered; only used if no mask entry turns up */
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			va.va_mode |= ap->ae_perm << 3;
+			break;
+		case ACL_OTHER:
+			va.va_mode |= ap->ae_perm;
+			break;
+		default:
+			break;
+		}
+		ap++;
+	}
+
+	/* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		va.va_mode |= gap->ae_perm << 3;
+
+	VOP_SETATTR(vp, &va, 0, sys_cred, error);
+	return error;
+}
+
+/*
+ * The permissions for the special ACL entries (u::, g::[m::], o::) are
+ * actually stored in the file mode (if there is both a group and a mask,
+ * the group is stored in the ACL entry and the mask is stored on the file).
+ * This allows the mode to remain automatically in sync with the ACL without
+ * the need for a call-back to the ACL system at every point where the mode
+ * could change. This function takes the permissions from the specified mode
+ * and places it in the supplied ACL.
+ *
+ * This implementation draws its validity from the fact that, when the ACL
+ * was assigned, the mode was copied from the ACL.
+ * If the mode did not change, therefore, the mode remains exactly what was
+ * taken from the special ACL entries at assignment.
+ * If a subsequent chmod() was done, the POSIX spec says that the change in
+ * mode must cause an update to the ACL seen at user level and used for
+ * access checks. Before and after a mode change, therefore, the file mode
+ * most accurately reflects what the special ACL entries should permit/deny.
+ *
+ * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
+ * the existing mode bits will override whatever is in the
+ * ACL. Similarly, if there is a pre-existing ACL that was
+ * never in sync with its mode (owing to a bug in 6.5 and
+ * before), it will now magically (or mystically) be
+ * synchronized. This could cause slight astonishment, but
+ * it is better than inconsistent permissions.
+ *
+ * The supplied ACL is a template that may contain any combination
+ * of special entries. These are treated as place holders when we fill
+ * out the ACL. This routine does not add or remove special entries, it
+ * simply unites each special entry with its associated set of permissions.
+ */
+STATIC void
+xfs_acl_sync_mode(
+	mode_t		mode,
+	xfs_acl_t	*acl)
+{
+	int		i, nomask = 1;
+	xfs_acl_entry_t	*ap;
+	xfs_acl_entry_t	*gap = NULL;
+	/* Only ae_perm of the special entries is overwritten; ACL_USER/ACL_GROUP entries are left alone. */
+	/*
+	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
+	 * be set instead of the GROUP entry, if there is a MASK.
+	 */
+	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			ap->ae_perm = (mode >> 6) & 0x7;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;	/* decided after the loop, once we know whether a mask exists */
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			ap->ae_perm = (mode >> 3) & 0x7;
+			break;
+		case ACL_OTHER:
+			ap->ae_perm = mode & 0x7;
+			break;
+		default:
+			break;
+		}
+	}
+	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		gap->ae_perm = (mode >> 3) & 0x7;
+}
+
+/*
+ * When inheriting an Access ACL from a directory Default ACL,
+ * the ACL bits are set to the intersection of the ACL default
+ * permission bits and the file permission bits in mode. If there
+ * are no permission bits on the file then we must not give them
+ * the ACL. This is what makes umask() work with ACLs.
+ */
+STATIC void
+xfs_acl_filter_mode(
+	mode_t		mode,
+	xfs_acl_t	*acl)
+{
+	int		i, nomask = 1;
+	xfs_acl_entry_t	*ap;
+	xfs_acl_entry_t	*gap = NULL;
+	/* Same walk as xfs_acl_sync_mode(), but the mode bits are ANDed into ae_perm instead of assigned. */
+	/*
+	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
+	 * be merged with GROUP entry, if there is a MASK.
+	 */
+	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			ap->ae_perm &= (mode >> 6) & 0x7;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			ap->ae_perm &= (mode >> 3) & 0x7;
+			break;
+		case ACL_OTHER:
+			ap->ae_perm &= mode & 0x7;
+			break;
+		default:
+			break;
+		}
+	}
+	/* Filter the ACL_GROUP_OBJ by the group bits if there's no ACL_MASK */
+	if (gap && nomask)
+		gap->ae_perm &= (mode >> 3) & 0x7;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_acl.h linux-2.4-xfs/fs/xfs/xfs_acl.h
--- linux-2.4.19/fs/xfs/xfs_acl.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_acl.h Fri Aug 23 14:44:16 2002
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ACL_H__
+#define __XFS_ACL_H__
+
+/*
+ * Access Control Lists
+ */
+typedef __uint16_t	xfs_acl_perm_t;
+typedef __int32_t	xfs_acl_type_t;
+typedef __int32_t	xfs_acl_tag_t;
+typedef __int32_t	xfs_acl_id_t;
+/* Capacity of the fixed acl_entry[] array, and the acl_cnt value that marks "no ACL". */
+#define XFS_ACL_MAX_ENTRIES 25
+#define XFS_ACL_NOT_PRESENT (-1)
+/* One ACL entry: its tag (entry type), an id (meaningful for ACL_USER/ACL_GROUP entries) and the permission bits. */
+typedef struct xfs_acl_entry {
+	xfs_acl_tag_t	ae_tag;
+	xfs_acl_id_t	ae_id;
+	xfs_acl_perm_t	ae_perm;
+} xfs_acl_entry_t;
+/* A whole ACL: entry count followed by a fixed-size entry array, of which the first acl_cnt are in use. */
+typedef struct xfs_acl {
+	__int32_t	acl_cnt;
+	xfs_acl_entry_t	acl_entry[XFS_ACL_MAX_ENTRIES];
+} xfs_acl_t;
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE	"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT	"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)	/* name length without the NUL */
+#define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+struct vattr;
+struct vnode;
+struct xfs_inode;
+
+extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
+extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
+extern int xfs_acl_get(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_set(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vhasacl_access(struct vnode *);
+extern int xfs_acl_vhasacl_default(struct vnode *);
+extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vremove(struct vnode *vp, int);
+
+extern struct kmem_zone *xfs_acl_zone;
+/* Values of the "kind" argument taken by the xfs_acl_v*() calls, plus the permission-bit sanity test. */
+#define _ACL_TYPE_ACCESS	1
+#define _ACL_TYPE_DEFAULT	2
+#define _ACL_PERM_INVALID(perm)	((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
+/* Wrappers for callers, so the same code also builds when CONFIG_FS_POSIX_ACL is off (see the #else branch). */
+#define _ACL_DECL(a)		xfs_acl_t *(a) = NULL
+#define _ACL_ALLOC(a)		((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
+#define _ACL_FREE(a)		((a)? kmem_zone_free(xfs_acl_zone, (a)) : 0)	/* does nothing for a NULL pointer */
+#define _ACL_ZONE_INIT(z,name)	((z) = kmem_zone_init(sizeof(xfs_acl_t), name))
+#define _ACL_ZONE_DESTROY(z)	(kmem_cache_destroy(z))
+#define _ACL_INHERIT(c,v,d)	(xfs_acl_inherit(c,v,d))
+#define _ACL_GET_ACCESS(pv,pa)	(xfs_acl_vtoacl(pv,pa,NULL) == 0)
+#define _ACL_GET_DEFAULT(pv,pd)	(xfs_acl_vtoacl(pv,NULL,pd) == 0)
+#define _ACL_ACCESS_EXISTS	xfs_acl_vhasacl_access
+#define _ACL_DEFAULT_EXISTS	xfs_acl_vhasacl_default
+#define _ACL_XFS_IACCESS(i,m,c)	(XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1)	/* -1 means no ACL was consulted */
+
+#else	/* !CONFIG_FS_POSIX_ACL: every operation becomes a constant */
+#define xfs_acl_vset(v,p,sz,t)	(-EOPNOTSUPP)
+#define xfs_acl_vget(v,p,sz,t)	(-EOPNOTSUPP)
+#define xfs_acl_vremove(v,t)	(-EOPNOTSUPP)
+#define _ACL_DECL(a)		((void)0)
+#define _ACL_ALLOC(a)		(1)	/* successfully allocate nothing */
+#define _ACL_FREE(a)		((void)0)
+#define _ACL_ZONE_INIT(z,name)	((void)0)
+#define _ACL_ZONE_DESTROY(z)	((void)0)
+#define _ACL_INHERIT(c,v,d)	(0)
+#define _ACL_GET_ACCESS(pv,pa)	(0)
+#define _ACL_GET_DEFAULT(pv,pd)	(0)
+#define _ACL_ACCESS_EXISTS	(NULL)
+#define _ACL_DEFAULT_EXISTS	(NULL)
+#define _ACL_XFS_IACCESS(i,m,c)	(-1)
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_ACL_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_ag.h linux-2.4-xfs/fs/xfs/xfs_ag.h
--- linux-2.4.19/fs/xfs/xfs_ag.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_ag.h Wed Jul 10 23:13:48 2002
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_AG_H__ +#define __XFS_AG_H__ + +/* + * Allocation group header + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). 
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+
+#define XFS_AGF_MAGIC	0x58414746	/* 'XAGF' */
+#define XFS_AGI_MAGIC	0x58414749	/* 'XAGI' */
+#define XFS_AGF_VERSION	1
+#define XFS_AGI_VERSION	1
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_GOOD_VERSION)
+int xfs_agf_good_version(unsigned v);
+#define XFS_AGF_GOOD_VERSION(v)	xfs_agf_good_version(v)
+#else
+#define XFS_AGF_GOOD_VERSION(v)	((v) == XFS_AGF_VERSION)	/* inline form: exact match only */
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_GOOD_VERSION)
+int xfs_agi_good_version(unsigned v);
+#define XFS_AGI_GOOD_VERSION(v)	xfs_agi_good_version(v)
+#else
+#define XFS_AGI_GOOD_VERSION(v)	((v) == XFS_AGI_VERSION)
+#endif
+
+/*
+ * Btree number 0 is bno, 1 is cnt. This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF	((int)XFS_BTNUM_CNTi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number. Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf
+{
+	/*
+	 * Common allocation group header information
+	 */
+	__uint32_t	agf_magicnum;	/* magic number == XFS_AGF_MAGIC */
+	__uint32_t	agf_versionnum;	/* header version == XFS_AGF_VERSION */
+	xfs_agnumber_t	agf_seqno;	/* sequence # starting from 0 */
+	xfs_agblock_t	agf_length;	/* size in blocks of a.g. */
+	/*
+	 * Freespace information
+	 */
+	xfs_agblock_t	agf_roots[XFS_BTNUM_AGF];	/* root blocks */
+	__uint32_t	agf_spare0;	/* spare field */
+	__uint32_t	agf_levels[XFS_BTNUM_AGF];	/* btree levels */
+	__uint32_t	agf_spare1;	/* spare field */
+	__uint32_t	agf_flfirst;	/* first freelist block's index */
+	__uint32_t	agf_fllast;	/* last freelist block's index */
+	__uint32_t	agf_flcount;	/* count of blocks in freelist */
+	xfs_extlen_t	agf_freeblks;	/* total free blocks */
+	xfs_extlen_t	agf_longest;	/* longest free space */
+} xfs_agf_t;
+/* One flag bit per xfs_agf_t field in declaration order (the spare fields have none); presumably field-selection masks -- confirm against users. */
+#define XFS_AGF_MAGICNUM	0x00000001
+#define XFS_AGF_VERSIONNUM	0x00000002
+#define XFS_AGF_SEQNO		0x00000004
+#define XFS_AGF_LENGTH		0x00000008
+#define XFS_AGF_ROOTS		0x00000010
+#define XFS_AGF_LEVELS		0x00000020
+#define XFS_AGF_FLFIRST		0x00000040
+#define XFS_AGF_FLLAST		0x00000080
+#define XFS_AGF_FLCOUNT		0x00000100
+#define XFS_AGF_FREEBLKS	0x00000200
+#define XFS_AGF_LONGEST		0x00000400
+#define XFS_AGF_NUM_BITS	11
+#define XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)	/* all eleven flags set */
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR		((xfs_daddr_t)1)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_BLOCK)
+xfs_agblock_t xfs_agf_block(struct xfs_mount *mp);
+#define XFS_AGF_BLOCK(mp)	xfs_agf_block(mp)
+#else
+#define XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR)
+#endif
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS	64
+
+typedef struct xfs_agi
+{
+	/*
+	 * Common allocation group header information
+	 */
+	__uint32_t	agi_magicnum;	/* magic number == XFS_AGI_MAGIC */
+	__uint32_t	agi_versionnum;	/* header version == XFS_AGI_VERSION */
+	xfs_agnumber_t	agi_seqno;	/* sequence # starting from 0 */
+	xfs_agblock_t	agi_length;	/* size in blocks of a.g. */
+	/*
+	 * Inode information
+	 * Inodes are mapped by interpreting the inode number, so no
+	 * mapping data is needed here.
+	 */
+	xfs_agino_t	agi_count;	/* count of allocated inodes */
+	xfs_agblock_t	agi_root;	/* root of inode btree */
+	__uint32_t	agi_level;	/* levels in inode btree */
+	xfs_agino_t	agi_freecount;	/* number of free inodes */
+	xfs_agino_t	agi_newino;	/* new inode just allocated */
+	xfs_agino_t	agi_dirino;	/* last directory inode chunk */
+	/*
+	 * Hash table of inodes which have been unlinked but are
+	 * still being referenced.
+	 */
+	xfs_agino_t	agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+} xfs_agi_t;
+/* One flag bit per xfs_agi_t field, in declaration order; presumably field-selection masks -- confirm against users. */
+#define XFS_AGI_MAGICNUM	0x00000001
+#define XFS_AGI_VERSIONNUM	0x00000002
+#define XFS_AGI_SEQNO		0x00000004
+#define XFS_AGI_LENGTH		0x00000008
+#define XFS_AGI_COUNT		0x00000010
+#define XFS_AGI_ROOT		0x00000020
+#define XFS_AGI_LEVEL		0x00000040
+#define XFS_AGI_FREECOUNT	0x00000080
+#define XFS_AGI_NEWINO		0x00000100
+#define XFS_AGI_DIRINO		0x00000200
+#define XFS_AGI_UNLINKED	0x00000400
+#define XFS_AGI_NUM_BITS	11
+#define XFS_AGI_ALL_BITS	((1 << XFS_AGI_NUM_BITS) - 1)	/* all eleven flags set */
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR		((xfs_daddr_t)2)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_BLOCK)
+xfs_agblock_t xfs_agi_block(struct xfs_mount *mp);
+#define XFS_AGI_BLOCK(mp)	xfs_agi_block(mp)
+#else
+#define XFS_AGI_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGI_DADDR)
+#endif
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR		((xfs_daddr_t)3)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGFL_BLOCK)
+xfs_agblock_t xfs_agfl_block(struct xfs_mount *mp);
+#define XFS_AGFL_BLOCK(mp)	xfs_agfl_block(mp)
+#else
+#define XFS_AGFL_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR)
+#endif
+#define XFS_AGFL_SIZE		(BBSIZE / sizeof(xfs_agblock_t))	/* number of agfl_bno slots that fit in BBSIZE bytes */
+typedef struct xfs_agfl
+{
+	xfs_agblock_t	agfl_bno[XFS_AGFL_SIZE];
+} xfs_agfl_t;
+
+/*
+ * Busy block/extent entry.
Used in perag to mark blocks that have been freed + * but whose transactions aren't committed to disk yet. + */ +typedef struct xfs_perag_busy { + xfs_agblock_t busy_start; + xfs_extlen_t busy_length; + struct xfs_trans *busy_tp; /* transaction that did the free */ +} xfs_perag_busy_t; + +/* + * Per-ag incore structure, copies of information in agf and agi, + * to improve the performance of allocation group selection. + * + * pick sizes which fit in allocation buckets well + */ +#if (BITS_PER_LONG == 32) +#define XFS_PAGB_NUM_SLOTS 84 +#elif (BITS_PER_LONG == 64) +#define XFS_PAGB_NUM_SLOTS 128 +#endif + +typedef struct xfs_perag +{ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is prefered to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + xfs_agino_t pagi_freecount; /* number of free inodes */ +#ifdef __KERNEL__ + lock_t pagb_lock; /* lock for pagb_list */ +#endif + int pagb_count; /* pagb slots in use */ + xfs_perag_busy_t *pagb_list; /* unstable blocks */ +} xfs_perag_t; + +#define XFS_AG_MIN_BYTES (1LL << 24) /* 16 MB */ +#define XFS_AG_BEST_BYTES (1LL << 30) /* 1 GB */ +#define XFS_AG_MAX_BYTES (1LL << 32) /* 4 GB */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MIN_BLOCKS) +xfs_extlen_t xfs_ag_min_blocks(int bl); +#define XFS_AG_MIN_BLOCKS(bl) xfs_ag_min_blocks(bl) +#else +#define XFS_AG_MIN_BLOCKS(bl) ((xfs_extlen_t)(XFS_AG_MIN_BYTES >> bl)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_BEST_BLOCKS) +xfs_extlen_t xfs_ag_best_blocks(int bl, xfs_drfsbno_t blks); +#define XFS_AG_BEST_BLOCKS(bl,blks) xfs_ag_best_blocks(bl,blks) +#else +/*--#define XFS_AG_BEST_BLOCKS(bl) 
((xfs_extlen_t)(XFS_AG_BEST_BYTES >> bl))*/ +/* + * Best is XFS_AG_BEST_BYTES worth of blocks at and below 64 Gigabyte + * filesystems, and XFS_AG_MAX_BLOCKS above 64 Gigabytes. + */ +#define XFS_AG_BEST_BLOCKS(bl,blks) (((1LL << (36 - (bl))) >= \ + (blks)) ? \ + ((xfs_extlen_t)(XFS_AG_BEST_BYTES >> (bl))) : \ + XFS_AG_MAX_BLOCKS(bl)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAX_BLOCKS) +xfs_extlen_t xfs_ag_max_blocks(int bl); +#define XFS_AG_MAX_BLOCKS(bl) xfs_ag_max_blocks(bl) +#else +#define XFS_AG_MAX_BLOCKS(bl) ((xfs_extlen_t)(XFS_AG_MAX_BYTES >> (bl))) +#endif + +#define XFS_MAX_AGNUMBER ((xfs_agnumber_t)(NULLAGNUMBER - 1)) + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAXLEVELS) +int xfs_ag_maxlevels(struct xfs_mount *mp); +#define XFS_AG_MAXLEVELS(mp) xfs_ag_maxlevels(mp) +#else +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST) +int xfs_min_freelist(xfs_agf_t *a, struct xfs_mount *mp); +#define XFS_MIN_FREELIST(a,mp) xfs_min_freelist(a,mp) +#else +#define XFS_MIN_FREELIST(a,mp) \ + XFS_MIN_FREELIST_RAW( \ + INT_GET((a)->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT), \ + INT_GET((a)->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT), mp) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_PAG) +int xfs_min_freelist_pag(xfs_perag_t *pag, struct xfs_mount *mp); +#define XFS_MIN_FREELIST_PAG(pag,mp) xfs_min_freelist_pag(pag,mp) +#else +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + XFS_MIN_FREELIST_RAW((uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_RAW) +int xfs_min_freelist_raw(int bl, int cl, struct xfs_mount *mp); +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) xfs_min_freelist_raw(bl,cl,mp) +#else +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN((bl) + 1, XFS_AG_MAXLEVELS(mp)) + \ + MIN((cl) + 1, XFS_AG_MAXLEVELS(mp))) +#endif + +#if 
XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_FSB) +xfs_fsblock_t xfs_agb_to_fsb(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno); +#define XFS_AGB_TO_FSB(mp,agno,agbno) xfs_agb_to_fsb(mp,agno,agbno) +#else +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGNO) +xfs_agnumber_t xfs_fsb_to_agno(struct xfs_mount *mp, xfs_fsblock_t fsbno); +#define XFS_FSB_TO_AGNO(mp,fsbno) xfs_fsb_to_agno(mp,fsbno) +#else +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGBNO) +xfs_agblock_t xfs_fsb_to_agbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); +#define XFS_FSB_TO_AGBNO(mp,fsbno) xfs_fsb_to_agbno(mp,fsbno) +#else +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_DADDR) +xfs_daddr_t xfs_agb_to_daddr(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno); +#define XFS_AGB_TO_DADDR(mp,agno,agbno) xfs_agb_to_daddr(mp,agno,agbno) +#else +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)(XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))) +#endif +/* + * XFS_DADDR_TO_AGNO and XFS_DADDR_TO_AGBNO moved to xfs_mount.h + * to avoid header file ordering change + */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_DADDR) +xfs_daddr_t xfs_ag_daddr(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_daddr_t d); +#define XFS_AG_DADDR(mp,agno,d) xfs_ag_daddr(mp,agno,d) +#else +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGF) +xfs_agf_t *xfs_buf_to_agf(struct xfs_buf *bp); +#define XFS_BUF_TO_AGF(bp) xfs_buf_to_agf(bp) +#else +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t 
*)XFS_BUF_PTR(bp)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGI) +xfs_agi_t *xfs_buf_to_agi(struct xfs_buf *bp); +#define XFS_BUF_TO_AGI(bp) xfs_buf_to_agi(bp) +#else +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGFL) +xfs_agfl_t *xfs_buf_to_agfl(struct xfs_buf *bp); +#define XFS_BUF_TO_AGFL(bp) xfs_buf_to_agfl(bp) +#else +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) +#endif + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_CHECK_DADDR) +void xfs_ag_check_daddr(struct xfs_mount *mp, xfs_daddr_t d, xfs_extlen_t len); +#define XFS_AG_CHECK_DADDR(mp,d,len) xfs_ag_check_daddr(mp,d,len) +#else +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ? \ + ASSERT((d) == XFS_SB_DADDR || \ + XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \ + ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \ + XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1))) +#endif + +#endif /* __XFS_AG_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_alloc.c linux-2.4-xfs/fs/xfs/xfs_alloc.c --- linux-2.4.19/fs/xfs/xfs_alloc.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_alloc.c Thu Aug 8 20:03:32 2002 @@ -0,0 +1,2639 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Free space allocation for XFS. + */ +#include <xfs.h> + +#if defined(DEBUG) +/* + * Allocation tracing buffer, handed to ktrace_enter() by the trace routines. + */ +ktrace_t *xfs_alloc_trace_buf; +#endif + +#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? 
((b) - (a)) : ((a) - (b))) + +#define XFSA_FIXUP_BNO_OK 1 +#define XFSA_FIXUP_CNT_OK 2 + +int +xfs_alloc_search_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len); + +#if defined(XFS_ALLOC_TRACE) +#define TRACE_ALLOC(s,a) \ + xfs_alloc_trace_alloc(fname, s, a, __LINE__) +#define TRACE_FREE(s,a,b,x,f) \ + xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__) +#define TRACE_MODAGF(s,a,f) \ + xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__) +#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) +#define TRACE_UNBUSY(fname,s,ag,sl,tp) \ + xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) +#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) + + +#else +#define TRACE_ALLOC(s,a) +#define TRACE_FREE(s,a,b,x,f) +#define TRACE_MODAGF(s,a,f) +#define TRACE_BUSY(s,a,ag,agb,l,sl,tp) +#define TRACE_UNBUSY(fname,s,ag,sl,tp) +#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) +#endif /* XFS_ALLOC_TRACE */ + +/* + * Prototypes for per-ag allocation routines + */ + +STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); + +/* + * Internal functions. + */ + +/* + * Compute aligned version of the found extent. + * Takes alignment and min length into account. 
+ */ +STATIC int /* success (>= minlen) */ +xfs_alloc_compute_aligned( + xfs_agblock_t foundbno, /* starting block in found extent */ + xfs_extlen_t foundlen, /* length in found extent */ + xfs_extlen_t alignment, /* alignment for allocation */ + xfs_extlen_t minlen, /* minimum length for allocation */ + xfs_agblock_t *resbno, /* result block number */ + xfs_extlen_t *reslen) /* result length */ +{ + xfs_agblock_t bno; + xfs_extlen_t diff; + xfs_extlen_t len; + + if (alignment > 1 && foundlen >= minlen) { + bno = roundup(foundbno, alignment); + diff = bno - foundbno; + len = diff >= foundlen ? 0 : foundlen - diff; + } else { + bno = foundbno; + len = foundlen; + } + *resbno = bno; + *reslen = len; + return len >= minlen; +} + +/* + * Compute best start block and diff for "near" allocations. + * freelen >= wantlen already checked by caller. + */ +STATIC xfs_extlen_t /* difference value (absolute) */ +xfs_alloc_compute_diff( + xfs_agblock_t wantbno, /* target starting block */ + xfs_extlen_t wantlen, /* target length */ + xfs_extlen_t alignment, /* target alignment */ + xfs_agblock_t freebno, /* freespace's starting block */ + xfs_extlen_t freelen, /* freespace's length */ + xfs_agblock_t *newbnop) /* result: best start block from free */ +{ + xfs_agblock_t freeend; /* end of freespace extent */ + xfs_agblock_t newbno1; /* return block number */ + xfs_agblock_t newbno2; /* other new block number */ + xfs_extlen_t newlen1=0; /* length with newbno1 */ + xfs_extlen_t newlen2=0; /* length with newbno2 */ + xfs_agblock_t wantend; /* end of target extent */ + + ASSERT(freelen >= wantlen); + freeend = freebno + freelen; + wantend = wantbno + wantlen; + if (freebno >= wantbno) { + if ((newbno1 = roundup(freebno, alignment)) >= freeend) + newbno1 = NULLAGBLOCK; + } else if (freeend >= wantend && alignment > 1) { + newbno1 = roundup(wantbno, alignment); + newbno2 = newbno1 - alignment; + if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + else + newlen1 = 
XFS_EXTLEN_MIN(wantlen, freeend - newbno1); + if (newbno2 < freebno) + newbno2 = NULLAGBLOCK; + else + newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); + if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { + if (newlen1 < newlen2 || + (newlen1 == newlen2 && + XFS_ABSDIFF(newbno1, wantbno) > + XFS_ABSDIFF(newbno2, wantbno))) + newbno1 = newbno2; + } else if (newbno2 != NULLAGBLOCK) + newbno1 = newbno2; + } else if (freeend >= wantend) { + newbno1 = wantbno; + } else if (alignment > 1) { + newbno1 = roundup(freeend - wantlen, alignment); + if (newbno1 > freeend - wantlen && + newbno1 - alignment >= freebno) + newbno1 -= alignment; + else if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + } else + newbno1 = freeend - wantlen; + *newbnop = newbno1; + return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); +} + +/* + * Fix up the length, based on mod and prod: round len down to the nearest + * value of the form k * prod + mod for some k. + * If len is too small, or rounding would take it below minlen, it is + * returned unchanged. If len hits maxlen it is left alone. + */ +STATIC void +xfs_alloc_fix_len( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_extlen_t k; /* current remainder, len % prod */ + xfs_extlen_t rlen; /* candidate rounded length */ + + ASSERT(args->mod < args->prod); + rlen = args->len; + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || + (args->mod == 0 && rlen < args->prod)) + return; + k = rlen % args->prod; + if (k == args->mod) + return; + if (k > args->mod) { /* drop only the excess over mod */ + if ((int)(rlen = rlen - (k - args->mod)) < (int)args->minlen) + return; /* (int) casts catch length underflow */ + } else { /* back one whole prod, then up to mod */ + if ((int)(rlen = rlen - args->prod + (args->mod - k)) < + (int)args->minlen) + return; /* (int) casts catch length underflow */ + } + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + args->len = rlen; +} + +/* + * Fix up length if there is too little space left in the a.g. + * Return 1 if ok, 0 if too little, should give up. 
+ */ +STATIC int +xfs_alloc_fix_minleft( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agf_t *agf; /* a.g. freelist header */ + int diff; /* free space difference */ + + if (args->minleft == 0) + return 1; + agf = XFS_BUF_TO_AGF(args->agbp); + diff = INT_GET(agf->agf_freeblks, ARCH_CONVERT) + + INT_GET(agf->agf_flcount, ARCH_CONVERT) + - args->len - args->minleft; + if (diff >= 0) + return 1; + args->len += diff; /* shrink the allocated space */ + if ((int)args->len >= (int)args->minlen) /* (int) catches underflow */ + return 1; + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Update the two btrees, logically removing from freespace the extent + * starting at rbno, rlen blocks. The extent is contained within the + * actual (current) free extent fbno for flen blocks. + * Flags are passed in indicating whether the cursors are set to the + * relevant records. + */ +STATIC int /* error code */ +xfs_alloc_fixup_trees( + xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ + xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + xfs_agblock_t fbno, /* starting block of free extent */ + xfs_extlen_t flen, /* length of free extent */ + xfs_agblock_t rbno, /* starting block of returned extent */ + xfs_extlen_t rlen, /* length of returned extent */ + int flags) /* flags, XFSA_FIXUP_... */ +{ + int error; /* error code */ + int i; /* operation results */ + xfs_agblock_t nfbno1; /* first new free startblock */ + xfs_agblock_t nfbno2; /* second new free startblock */ + xfs_extlen_t nflen1=0; /* first new free length */ + xfs_extlen_t nflen2=0; /* second new free length */ + + /* + * Look up the record in the by-size tree if necessary. 
+ */ + if (flags & XFSA_FIXUP_CNT_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Look up the record in the by-block tree if necessary. + */ + if (flags & XFSA_FIXUP_BNO_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } +#ifdef DEBUG + { + xfs_alloc_block_t *bnoblock; + xfs_alloc_block_t *cntblock; + + if (bno_cur->bc_nlevels == 1 && + cnt_cur->bc_nlevels == 1) { + bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); + cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); + XFS_WANT_CORRUPTED_RETURN( + INT_GET(bnoblock->bb_numrecs, ARCH_CONVERT) == INT_GET(cntblock->bb_numrecs, ARCH_CONVERT)); + } + } +#endif + /* + * Deal with all four cases: the allocated record is contained + * within the freespace record, so we can have new freespace + * at either (or both) end, or no freespace remaining. + */ + if (rbno == fbno && rlen == flen) + nfbno1 = nfbno2 = NULLAGBLOCK; + else if (rbno == fbno) { + nfbno1 = rbno + rlen; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else if (rbno + rlen == fbno + flen) { + nfbno1 = fbno; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else { + nfbno1 = fbno; + nflen1 = rbno - fbno; + nfbno2 = rbno + rlen; + nflen2 = (fbno + flen) - nfbno2; + } + /* + * Delete the entry from the by-size btree. + */ + if ((error = xfs_alloc_delete(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + /* + * Add new by-size btree entry(s). 
+ */ + if (nfbno1 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_alloc_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + if (nfbno2 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_alloc_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Fix up the by-block btree entry(s). + */ + if (nfbno1 == NULLAGBLOCK) { + /* + * No remaining freespace, just delete the by-block tree entry. + */ + if ((error = xfs_alloc_delete(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } else { + /* + * Update the by-block entry to start later|be shorter. + */ + if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) + return error; + } + if (nfbno2 != NULLAGBLOCK) { + /* + * 2 resulting free entries, need to add one. + */ + if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_alloc_insert(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + return 0; +} + +/* + * Read in the allocation group free block array. 
+ */ +STATIC int /* error */ +xfs_alloc_read_agfl( + xfs_mount_t *mp, /* mount point structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_buf_t **bpp) /* buffer for the ag free block array */ +{ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* disk block address */ + int error; + + ASSERT(agno != NULLAGNUMBER); + d = XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1, 0, &bp))) + return error; + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + *bpp = bp; + return 0; +} + +#if defined(XFS_ALLOC_TRACE) +/* + * Add an allocation trace entry for an alloc call. + */ +STATIC void +xfs_alloc_trace_alloc( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_alloc_arg_t *args, /* allocation argument structure */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)), + (void *)name, + (void *)str, + (void *)args->mp, + (void *)(__psunsigned_t)args->agno, + (void *)(__psunsigned_t)args->agbno, + (void *)(__psunsigned_t)args->minlen, + (void *)(__psunsigned_t)args->maxlen, + (void *)(__psunsigned_t)args->mod, + (void *)(__psunsigned_t)args->prod, + (void *)(__psunsigned_t)args->minleft, + (void *)(__psunsigned_t)args->total, + (void *)(__psunsigned_t)args->alignment, + (void *)(__psunsigned_t)args->len, + (void *)((((__psint_t)args->type) << 16) | + (__psint_t)args->otype), + (void *)(__psint_t)((args->wasdel << 3) | + (args->wasfromfl << 2) | + (args->isfl << 1) | + (args->userdata << 0))); +} + +/* + * Add an allocation trace entry for a free call. 
+ */ +STATIC void +xfs_alloc_trace_free( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_mount_t *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* a.g. relative block number */ + xfs_extlen_t len, /* length of extent */ + int isfl, /* set if is freelist allocation/free */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)), + (void *)name, + (void *)str, + (void *)mp, + (void *)(__psunsigned_t)agno, + (void *)(__psunsigned_t)agbno, + (void *)(__psunsigned_t)len, + (void *)(__psint_t)isfl, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add an allocation trace entry for modifying an agf. + */ +STATIC void +xfs_alloc_trace_modagf( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_mount_t *mp, /* file system mount point */ + xfs_agf_t *agf, /* new agf value */ + int flags, /* logging flags for agf */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)), + (void *)name, + (void *)str, + (void *)mp, + (void *)(__psint_t)flags, + (void *)(__psunsigned_t)INT_GET(agf->agf_seqno, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_length, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_BNO], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_CNT], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_BNO], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_CNT], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_flfirst, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_fllast, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_flcount, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_freeblks, ARCH_CONVERT), + (void 
*)(__psunsigned_t)INT_GET(agf->agf_longest, ARCH_CONVERT)); +} + +STATIC void +xfs_alloc_trace_busy( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_mount_t *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* a.g. relative block number */ + xfs_extlen_t len, /* length of extent */ + int slot, /* perag Busy slot */ + xfs_trans_t *tp, /* transaction pointer */ + int trtype, /* type: add, delete, search */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(trtype | (line << 16)), + (void *)name, + (void *)str, + (void *)mp, + (void *)(__psunsigned_t)agno, + (void *)(__psunsigned_t)agbno, + (void *)(__psunsigned_t)len, + (void *)(__psint_t)slot, + (void *)tp, + NULL, NULL, NULL, NULL, NULL, NULL, NULL); +} +#endif /* XFS_ALLOC_TRACE */ + +/* + * Allocation group level functions. + */ + +/* + * Allocate a variable extent in the allocation group agno. + * Type and bno are used to determine where in the allocation group the + * extent will start. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent( + xfs_alloc_arg_t *args) /* argument structure for allocation */ +{ + int error=0; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent"; +#endif + + ASSERT(args->minlen > 0); + ASSERT(args->maxlen > 0); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->mod < args->prod); + ASSERT(args->alignment > 0); + /* + * Branch to correct routine based on the type. 
+ */ + args->wasfromfl = 0; + switch (args->type) { + case XFS_ALLOCTYPE_THIS_AG: + error = xfs_alloc_ag_vextent_size(args); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_alloc_ag_vextent_near(args); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_alloc_ag_vextent_exact(args); + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (error) + return error; + /* + * If the allocation worked, need to change the agf structure + * (and log it), and the superblock. + */ + if (args->agbno != NULLAGBLOCK) { + xfs_agf_t *agf; /* allocation group freelist header */ +#ifdef XFS_ALLOC_TRACE + xfs_mount_t *mp = args->mp; +#endif + long slen = (long)args->len; + + ASSERT(args->len >= args->minlen && args->len <= args->maxlen); + ASSERT(!(args->wasfromfl) || !args->isfl); + ASSERT(args->agbno % args->alignment == 0); + if (!(args->wasfromfl)) { + + agf = XFS_BUF_TO_AGF(args->agbp); + INT_MOD(agf->agf_freeblks, ARCH_CONVERT, -(args->len)); + xfs_trans_agblocks_delta(args->tp, + -((long)(args->len))); + args->pag->pagf_freeblks -= args->len; + ASSERT(INT_GET(agf->agf_freeblks, ARCH_CONVERT) + <= INT_GET(agf->agf_length, ARCH_CONVERT)); + TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); + xfs_alloc_log_agf(args->tp, args->agbp, + XFS_AGF_FREEBLKS); + /* search the busylist for these blocks */ + xfs_alloc_search_busy(args->tp, args->agno, + args->agbno, args->len); + } + if (!args->isfl) + xfs_trans_mod_sb(args->tp, + args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS, -slen); + XFS_STATS_INC(xfsstats.xs_allocx); + XFS_STATS_ADD(xfsstats.xs_allocb, args->len); + } + return 0; +} + +/* + * Allocate a variable extent at exactly agno/bno. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. 
+ */ +STATIC int /* error */ +xfs_alloc_ag_vextent_exact( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ + xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + xfs_agblock_t end; /* end of allocated extent */ + int error; + xfs_agblock_t fbno; /* start block of found extent */ + xfs_agblock_t fend; /* end block of found extent */ + xfs_extlen_t flen; /* length of found extent */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_exact"; +#endif + int i; /* success/failure of operation */ + xfs_agblock_t maxend; /* end of maximal extent */ + xfs_agblock_t minend; /* end of minimal extent */ + xfs_extlen_t rlen; /* length of returned extent */ + + ASSERT(args->alignment == 1); + /* + * Allocate/initialize a cursor for the by-number freespace btree. + */ + bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO, 0, 0); + /* + * Lookup bno and minlen in the btree (minlen is irrelevant, really). + * Look for the closest free block <= bno, it must contain bno + * if any free block does. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find it, return null. + */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + return 0; + } + /* + * Grab the freespace record. + */ + if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ASSERT(fbno <= args->agbno); + minend = args->agbno + args->minlen; + maxend = args->agbno + args->maxlen; + fend = fbno + flen; + /* + * Give up if the freespace isn't long enough for the minimum request. + */ + if (fend < minend) { + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + return 0; + } + /* + * End of extent will be smaller of the freespace end and the + * maximal requested end. 
+ */ + end = XFS_AGBLOCK_MIN(fend, maxend); + /* + * Fix the length according to mod and prod if given. + */ + args->len = end - args->agbno; + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + ASSERT(args->agbno + rlen <= fend); + end = args->agbno + rlen; + /* + * We are allocating agbno for rlen [agbno .. end] + * Allocate/initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT, 0, 0); + ASSERT(args->agbno + args->len <= + INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT)); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + goto error0; + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("normal", args); + args->wasfromfl = 0; + return 0; + +error0: + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + TRACE_ALLOC("error", args); + return error; +} + +/* + * Allocate a variable extent near bno in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_near( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ + xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ + xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_near"; +#endif + xfs_agblock_t gtbno; /* start bno of right side entry */ + xfs_agblock_t gtbnoa; /* aligned ... 
*/ + xfs_extlen_t gtdiff; /* difference to right side entry */ + xfs_extlen_t gtlen; /* length of right side entry */ + xfs_extlen_t gtlena; /* aligned ... */ + xfs_agblock_t gtnew; /* useful start bno of right side */ + int error; /* error code */ + int i; /* result code, temporary */ + int j; /* result code, temporary */ + xfs_agblock_t ltbno; /* start bno of left side entry */ + xfs_agblock_t ltbnoa; /* aligned ... */ + xfs_extlen_t ltdiff; /* difference to left side entry */ + /*REFERENCED*/ + xfs_agblock_t ltend; /* end bno of left side entry */ + xfs_extlen_t ltlen; /* length of left side entry */ + xfs_extlen_t ltlena; /* aligned ... */ + xfs_agblock_t ltnew; /* useful start bno of left side */ + xfs_extlen_t rlen; /* length of returned extent */ +#if defined(DEBUG) && defined(__KERNEL__) + /* + * Randomly don't execute the first algorithm. + */ + static int seed; /* randomizing seed value */ + int dofirst; /* set to do first algorithm */ + timespec_t now; /* current time */ + + if (!seed) { + nanotime(&now); + seed = (int)now.tv_sec ^ (int)now.tv_nsec; + } + dofirst = random() & 1; +#endif + /* + * Get a cursor for the by-size btree. + */ + cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT, 0, 0); + ltlen = 0; + bno_cur_lt = bno_cur_gt = NULL; + /* + * See if there are any free extents as big as maxlen. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. + */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, + <len, &i))) + goto error0; + if (i == 0 || ltlen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + ASSERT(i == 1); + } + args->wasfromfl = 0; + /* + * First algorithm. + * If the requested extent is large wrt the freespaces available + * in this a.g., then the cursor will be pointing to a btree entry + * near the right edge of the tree. 
If it's in the last btree leaf + * block, then we just examine all the entries in that block + * that are big enough, and pick the best one. + * This is written as a while loop so we can break out of it, + * but we never loop back to the top. + */ + while (xfs_btree_islastblock(cnt_cur, 0)) { + xfs_extlen_t bdiff; + int besti=0; + xfs_extlen_t blen=0; + xfs_agblock_t bnew=0; + +#if defined(DEBUG) && defined(__KERNEL__) + if (!dofirst) + break; +#endif + /* + * Start from the entry that lookup found, sequence through + * all larger free blocks. If we're actually pointing at a + * record smaller than maxlen, go to the start of this block, + * and skip all those smaller than minlen. + */ + if (ltlen || args->alignment > 1) { + cnt_cur->bc_ptrs[0] = 1; + do { + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (ltlen >= args->minlen) + break; + if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) + goto error0; + } while (i); + ASSERT(ltlen >= args->minlen); + if (!i) + break; + } + i = cnt_cur->bc_ptrs[0]; + for (j = 1, blen = 0, bdiff = 0; + !error && j && (blen < args->maxlen || bdiff > 0); + error = xfs_alloc_increment(cnt_cur, 0, &j)) { + /* + * For each entry, decide if it's better than + * the previous best entry. + */ + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (!xfs_alloc_compute_aligned(ltbno, ltlen, + args->alignment, args->minlen, + <bnoa, <lena)) + continue; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < blen) + continue; + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, ltbno, ltlen, <new); + if (ltnew != NULLAGBLOCK && + (args->len > blen || ltdiff < bdiff)) { + bdiff = ltdiff; + bnew = ltnew; + blen = args->len; + besti = cnt_cur->bc_ptrs[0]; + } + } + /* + * It didn't work. 
We COULD be in a case where + * there's a good record somewhere, so try again. + */ + if (blen == 0) + break; + /* + * Point at the best entry, and retrieve it again. + */ + cnt_cur->bc_ptrs[0] = besti; + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ltend = ltbno + ltlen; + ASSERT(ltend <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT)); + args->len = blen; + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("nominleft", args); + return 0; + } + blen = args->len; + /* + * We are allocating starting at bnew for blen blocks. + */ + args->agbno = bnew; + ASSERT(bnew >= ltbno); + ASSERT(bnew + blen <= ltend); + /* + * Set up a cursor for the by-bno tree. + */ + bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO, 0, 0); + /* + * Fix up the btree entries. + */ + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, + ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + TRACE_ALLOC("first", args); + return 0; + } + /* + * Second algorithm. + * Search in the by-bno tree to the left and to the right + * simultaneously, until in each case we find a space big enough, + * or run into the edge of the tree. When we run into the edge, + * we deallocate that cursor. + * If both searches succeed, we compare the two spaces and pick + * the better one. + * With alignment, it's possible for both to fail; the upper + * level algorithm that picks allocation groups for allocations + * is not supposed to do this. + */ + /* + * Allocate and initialize the cursor for the leftward search. + */ + bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO, 0, 0); + /* + * Lookup <= bno to find the leftward search's starting point. 
+ */ + if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find anything; use this cursor for the rightward + * search. + */ + bno_cur_gt = bno_cur_lt; + bno_cur_lt = 0; + } + /* + * Found something. Duplicate the cursor for the rightward search. + */ + else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) + goto error0; + /* + * Increment the cursor, so we will point at the entry just right + * of the leftward entry if any, or to the leftmost entry. + */ + if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + /* + * It failed, there are no rightward entries. + */ + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Loop going left with the leftward cursor, right with the + * rightward cursor, until either both directions give up or + * we find an entry at least as big as minlen. + */ + do { + if (bno_cur_lt) { + if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (xfs_alloc_compute_aligned(ltbno, ltlen, + args->alignment, args->minlen, + <bnoa, <lena)) + break; + if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + if (bno_cur_gt) { + if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (xfs_alloc_compute_aligned(gtbno, gtlen, + args->alignment, args->minlen, + >bnoa, >lena)) + break; + if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + } while (bno_cur_lt || bno_cur_gt); + /* + * Got both cursors still active, need to find better entry. + */ + if (bno_cur_lt && bno_cur_gt) { + /* + * Left side is long enough, look for a right side entry. 
+ */ + if (ltlena >= args->minlen) { + /* + * Fix up the length. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, + args->alignment, ltbno, ltlen, <new); + /* + * Not perfect. + */ + if (ltdiff) { + /* + * Look until we find a better one, run out of + * space, or run off the end. + */ + while (bno_cur_lt && bno_cur_gt) { + if ((error = xfs_alloc_get_rec( + bno_cur_gt, >bno, + >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(gtbno, gtlen, + args->alignment, args->minlen, + >bnoa, >lena); + /* + * The left one is clearly better. + */ + if (gtbnoa >= args->agbno + ltdiff) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + break; + } + /* + * If we reach a big enough entry, + * compare the two and pick the best. + */ + if (gtlena >= args->minlen) { + args->len = + XFS_EXTLEN_MIN(gtlena, + args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + gtdiff = xfs_alloc_compute_diff( + args->agbno, rlen, + args->alignment, + gtbno, gtlen, >new); + /* + * Right side is better. + */ + if (gtdiff < ltdiff) { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + /* + * Left side is better. + */ + else { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + break; + } + /* + * Fell off the right end. + */ + if ((error = xfs_alloc_increment( + bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + break; + } + } + } + /* + * The left side is perfect, trash the right side. + */ + else { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + /* + * It's the right side that was found first, look left. + */ + else { + /* + * Fix up the length. 
+ */ + args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, + args->alignment, gtbno, gtlen, >new); + /* + * Right side entry isn't perfect. + */ + if (gtdiff) { + /* + * Look until we find a better one, run out of + * space, or run off the end. + */ + while (bno_cur_lt && bno_cur_gt) { + if ((error = xfs_alloc_get_rec( + bno_cur_lt, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(ltbno, ltlen, + args->alignment, args->minlen, + <bnoa, <lena); + /* + * The right one is clearly better. + */ + if (ltbnoa <= args->agbno - gtdiff) { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + break; + } + /* + * If we reach a big enough entry, + * compare the two and pick the best. + */ + if (ltlena >= args->minlen) { + args->len = XFS_EXTLEN_MIN( + ltlena, args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + ltdiff = xfs_alloc_compute_diff( + args->agbno, rlen, + args->alignment, + ltbno, ltlen, <new); + /* + * Left side is better. + */ + if (ltdiff < gtdiff) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Right side is better. + */ + else { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + break; + } + /* + * Fell off the left end. + */ + if ((error = xfs_alloc_decrement( + bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + break; + } + } + } + /* + * The right side is perfect, trash the left side. + */ + else { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + } + /* + * If we couldn't get anything, give up. 
+ */ + if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + TRACE_ALLOC("neither", args); + args->agbno = NULLAGBLOCK; + return 0; + } + /* + * At this point we have selected a freespace entry, either to the + * left or to the right. If it's on the right, copy all the + * useful variables to the "left" set so we only have one + * copy of this code. + */ + if (bno_cur_gt) { + bno_cur_lt = bno_cur_gt; + bno_cur_gt = NULL; + ltbno = gtbno; + ltbnoa = gtbnoa; + ltlen = gtlen; + ltlena = gtlena; + j = 1; + } else + j = 0; + /* + * Fix up the length and compute the useful address. + */ + ltend = ltbno + ltlen; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + TRACE_ALLOC("nominleft", args); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, + ltlen, <new); + ASSERT(ltnew >= ltbno); + ASSERT(ltnew + rlen <= ltend); + ASSERT(ltnew + rlen <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT)); + args->agbno = ltnew; + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, + ltnew, rlen, XFSA_FIXUP_BNO_OK))) + goto error0; + TRACE_ALLOC(j ? "gt" : "lt", args); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + return 0; + + error0: + TRACE_ALLOC("error", args); + if (cnt_cur != NULL) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur_lt != NULL) + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); + if (bno_cur_gt != NULL) + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); + return error; +} + +/* + * Allocate a variable extent anywhere in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. 
block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_size( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + int error; /* error result */ + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_size"; +#endif + int i; /* temp status variable */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + + /* + * Allocate and initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT, 0, 0); + bno_cur = NULL; + /* + * Look for an entry >= maxlen+alignment-1 blocks. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, + args->maxlen + args->alignment - 1, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. + */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, + &flen, &i))) + goto error0; + if (i == 0 || flen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("noentry", args); + return 0; + } + ASSERT(i == 1); + } + /* + * There's a freespace as big as maxlen+alignment-1, get it. + */ + else { + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + /* + * In the first case above, we got the last entry in the + * by-size btree. Now we check to see if the space hits maxlen + * once aligned; if not, we search left for something better. + * This can't happen in the second case above. 
+ */ + xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, + &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (rlen < args->maxlen) { + xfs_agblock_t bestfbno; + xfs_extlen_t bestflen; + xfs_agblock_t bestrbno; + xfs_extlen_t bestrlen; + + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + for (;;) { + if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) + goto error0; + if (i == 0) + break; + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (flen < bestrlen) + break; + xfs_alloc_compute_aligned(fbno, flen, args->alignment, + args->minlen, &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), + error0); + if (rlen > bestrlen) { + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + if (rlen == args->maxlen) + break; + } + } + if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + rlen = bestrlen; + rbno = bestrbno; + flen = bestflen; + fbno = bestfbno; + } + args->wasfromfl = 0; + /* + * Fix up the length. + */ + args->len = rlen; + xfs_alloc_fix_len(args); + if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("nominleft", args); + args->agbno = NULLAGBLOCK; + return 0; + } + rlen = args->len; + XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + /* + * Allocate and initialize a cursor for the by-block tree. 
+ */ + bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO, 0, 0); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + rbno, rlen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + cnt_cur = bno_cur = NULL; + args->len = rlen; + args->agbno = rbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT), + error0); + TRACE_ALLOC("normal", args); + return 0; + +error0: + TRACE_ALLOC("error", args); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Deal with the case where only small freespaces remain. + * Either return the contents of the last freespace record, + * or allocate space from the freelist if there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_btree_cur_t *ccur, /* by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + int error; + xfs_agblock_t fbno; + xfs_extlen_t flen; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_small"; +#endif + int i; + + if ((error = xfs_alloc_decrement(ccur, 0, &i))) + goto error0; + if (i) { + if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + /* + * Nothing in the btree, try the freelist. Make sure + * to respect minleft even when pulling from the + * freelist. 
+ */ + else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + (INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_flcount, + ARCH_CONVERT) > args->minleft)) { + if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno))) + goto error0; + if (fbno != NULLAGBLOCK) { + if (args->userdata) { + xfs_buf_t *bp; + + bp = xfs_btree_get_bufs(args->mp, args->tp, + args->agno, fbno, 0); + xfs_trans_binval(args->tp, bp); + } + args->len = 1; + args->agbno = fbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT), + error0); + args->wasfromfl = 1; + TRACE_ALLOC("freelist", args); + *stat = 0; + return 0; + } + /* + * Nothing in the freelist. + */ + else + flen = 0; + } + /* + * Can't allocate from the freelist for some reason. + */ + else + flen = 0; + /* + * Can't do the allocation, give up. + */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + TRACE_ALLOC("notenough", args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + TRACE_ALLOC("normal", args); + return 0; + +error0: + TRACE_ALLOC("error", args); + return error; +} + +/* + * Free the extent starting at agno/bno for length. + */ +STATIC int /* error */ +xfs_free_ag_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. 
freelist header */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t bno, /* starting block number */ + xfs_extlen_t len, /* length of extent */ + int isfl) /* set if is freelist blocks - no sb acctg */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ + int error; /* error return value */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_free_ag_extent"; +#endif + xfs_agblock_t gtbno; /* start of right neighbor block */ + xfs_extlen_t gtlen; /* length of right neighbor block */ + int haveleft; /* have a left neighbor block */ + int haveright; /* have a right neighbor block */ + int i; /* temp, result code */ + xfs_agblock_t ltbno; /* start of left neighbor block */ + xfs_extlen_t ltlen; /* length of left neighbor block */ + xfs_mount_t *mp; /* mount point struct for filesystem */ + xfs_agblock_t nbno; /* new starting block of freespace */ + xfs_extlen_t nlen; /* new length of freespace */ + + mp = tp->t_mountp; + /* + * Allocate and initialize a cursor for the by-block btree. + */ + bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, 0, + 0); + cnt_cur = NULL; + /* + * Look for a neighboring block on the left (lower block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) + goto error0; + if (haveleft) { + /* + * There is a block to our left. + */ + if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (ltbno + ltlen < bno) + haveleft = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + } + } + /* + * Look for a neighboring block on the right (higher block numbers) + * that is contiguous with this space. 
+ */ + if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) + goto error0; + if (haveright) { + /* + * There is a block to our right. + */ + if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (bno + len < gtbno) + haveright = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + } + } + /* + * Now allocate and initialize a cursor for the by-size tree. + */ + cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, 0, + 0); + /* + * Have both left and right contiguous neighbors. + * Merge all three into a single free block. + */ + if (haveleft && haveright) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-block entry for the right block. + */ + if ((error = xfs_alloc_delete(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Move the by-block cursor back to the left neighbor. + */ + if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); +#ifdef DEBUG + /* + * Check that this is the right record: delete didn't + * mangle the cursor. 
+ */ + { + xfs_agblock_t xxbno; + xfs_extlen_t xxlen; + + if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO( + i == 1 && xxbno == ltbno && xxlen == ltlen, + error0); + } +#endif + /* + * Update remaining by-block entry to the new, joined block. + */ + nbno = ltbno; + nlen = len + ltlen + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a left contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveleft) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Back up the by-block cursor to the left neighbor, and + * update its length. + */ + if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + nbno = ltbno; + nlen = len + ltlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a right contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveright) { + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Update the starting block and length of the right + * neighbor in the by-block tree. + */ + nbno = bno; + nlen = len + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * No contiguous neighbors. + * Insert the new freespace into the by-block tree. 
+ */ + else { + nbno = bno; + nlen = len; + if ((error = xfs_alloc_insert(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + bno_cur = NULL; + /* + * In all cases we need to insert the new freespace in the by-size tree. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + if ((error = xfs_alloc_insert(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + cnt_cur = NULL; + /* + * Update the freespace totals in the ag and superblock. + */ + { + xfs_agf_t *agf; + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + pag = &mp->m_perag[agno]; + INT_MOD(agf->agf_freeblks, ARCH_CONVERT, len); + xfs_trans_agblocks_delta(tp, len); + pag->pagf_freeblks += len; + XFS_WANT_CORRUPTED_GOTO( + INT_GET(agf->agf_freeblks, ARCH_CONVERT) + <= INT_GET(agf->agf_length, ARCH_CONVERT), + error0); + TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); + if (!isfl) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); + XFS_STATS_INC(xfsstats.xs_freex); + XFS_STATS_ADD(xfsstats.xs_freeb, len); + } + TRACE_FREE(haveleft ? + (haveright ? "both" : "left") : + (haveright ? "right" : "none"), + agno, bno, len, isfl); + + /* + * Since blocks move to the free list without the coordination + * used in xfs_bmap_finish, we can't allow block to be available + * for reallocation and non-transaction writing (user data) + * until we know that the transaction that moved it to the free + * list is permanently on disk. We track the blocks by declaring + * these blocks as "busy"; the busy list is maintained on a per-ag + * basis and each transaction records which entries should be removed + * when the iclog commits to disk. If a busy block is allocated, + * the iclog is pushed up to the LSN that freed the block. 
+ */ + xfs_alloc_mark_busy(tp, agno, bno, len); + return 0; + + error0: + TRACE_FREE("error", agno, bno, len, isfl); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Visible (exported) allocation/free functions. + * Some of these are used just by xfs_alloc_btree.c and this file. + */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_ag_maxlevels = level; +} + +/* + * Decide whether to use this allocation group for this allocation. + * If so, fix up the btree freelist's size. + * This is external so mkfs can call it, too. + */ +int /* error */ +xfs_alloc_fix_freelist( + xfs_alloc_arg_t *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ +{ + xfs_buf_t *agbp; /* agf buffer pointer */ + xfs_agf_t *agf; /* a.g. 
freespace structure pointer */ + xfs_buf_t *agflbp;/* agfl buffer pointer */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t delta; /* new blocks needed in freelist */ + int error; /* error result code */ + xfs_extlen_t longest;/* longest extent in allocation group */ + xfs_mount_t *mp; /* file system mount point structure */ + xfs_extlen_t need; /* total blocks needed in freelist */ + xfs_perag_t *pag; /* per-ag information structure */ + xfs_alloc_arg_t targs; /* local allocation arguments */ + xfs_trans_t *tp; /* transaction pointer */ + + mp = args->mp; + + pag = args->pag; + tp = args->tp; + if (!pag->pagf_init) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (!pag->pagf_init) { + args->agbp = NULL; + return 0; + } + } else + agbp = NULL; + + /* If this is a metadata prefered pag and we are user data + * then try somewhere else if we are not being asked to + * try harder at this point + */ + if (pag->pagf_metadata && args->userdata && flags) { + args->agbp = NULL; + return 0; + } + + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + if (args->minlen + args->alignment + args->minalignslop - 1 > longest || + (args->minleft && + (int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < + (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + /* + * Get the a.g. freespace buffer. + * Can fail if we're not blocking on locks, and it's held. 
+ */ + if (agbp == NULL) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (agbp == NULL) { + args->agbp = NULL; + return 0; + } + } + /* + * Figure out how many blocks we should have in the freelist. + */ + agf = XFS_BUF_TO_AGF(agbp); + need = XFS_MIN_FREELIST(agf, mp); + delta = need > INT_GET(agf->agf_flcount, ARCH_CONVERT) ? + (need - INT_GET(agf->agf_flcount, ARCH_CONVERT)) : 0; + /* + * If there isn't enough total or single-extent, reject it. + */ + longest = INT_GET(agf->agf_longest, ARCH_CONVERT); + longest = (longest > delta) ? (longest - delta) : + (INT_GET(agf->agf_flcount, ARCH_CONVERT) > 0 || longest > 0); + if (args->minlen + args->alignment + args->minalignslop - 1 > longest || + (args->minleft && + (int)(INT_GET(agf->agf_freeblks, ARCH_CONVERT) + + INT_GET(agf->agf_flcount, ARCH_CONVERT) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + /* + * Make the freelist shorter if it's too long. + */ + while (INT_GET(agf->agf_flcount, ARCH_CONVERT) > need) { + xfs_buf_t *bp; + + if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) + return error; + if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) + return error; + bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + xfs_trans_binval(tp, bp); + } + /* + * Initialize the args structure. + */ + targs.tp = tp; + targs.mp = mp; + targs.agbp = agbp; + targs.agno = args->agno; + targs.mod = targs.minleft = targs.wasdel = targs.userdata = + targs.minalignslop = 0; + targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.type = XFS_ALLOCTYPE_THIS_AG; + targs.pag = pag; + if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) + return error; + /* + * Make the freelist longer if it's too short. 
+ */ + while (INT_GET(agf->agf_flcount, ARCH_CONVERT) < need) { + targs.agbno = 0; + targs.maxlen = need - INT_GET(agf->agf_flcount, ARCH_CONVERT); + /* + * Allocate as many blocks as possible at once. + */ + if ((error = xfs_alloc_ag_vextent(&targs))) + return error; + /* + * Stop if we run out. Won't happen if callers are obeying + * the restrictions correctly. Can happen for free calls + * on a completely full ag. + */ + if (targs.agbno == NULLAGBLOCK) + break; + /* + * Put each allocated block on the list. + */ + for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { + if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp, + bno))) + return error; + } + } + args->agbp = agbp; + return 0; +} + +/* + * Get a block from the freelist. + * The block number is returned in *bnop; it is set to NULLAGBLOCK + * (with a zero return value) when the freelist is empty. + */ +int /* error */ +xfs_alloc_get_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop) /* block address retrieved from freelist */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. freelist structure */ + xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ + xfs_agblock_t bno; /* block number returned */ + int error; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_get_freelist"; +#endif + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + /* + * Freelist is empty, give up. + */ + if (INT_ISZERO(agf->agf_flcount, ARCH_CONVERT)) { + *bnop = NULLAGBLOCK; + return 0; + } + /* + * Read the array of free blocks. + */ + mp = tp->t_mountp; + if ((error = xfs_alloc_read_agfl(mp, tp, + INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + /* + * Get the block number and update the data structures. 
+ */ + bno = INT_GET(agfl->agfl_bno[INT_GET(agf->agf_flfirst, ARCH_CONVERT)], ARCH_CONVERT); + INT_MOD(agf->agf_flfirst, ARCH_CONVERT, 1); + xfs_trans_brelse(tp, agflbp); + if (INT_GET(agf->agf_flfirst, ARCH_CONVERT) == XFS_AGFL_SIZE) + INT_ZERO(agf->agf_flfirst, ARCH_CONVERT); + pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)]; + INT_MOD(agf->agf_flcount, ARCH_CONVERT, -1); + xfs_trans_agflist_delta(tp, -1); + pag->pagf_flcount--; + TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); + *bnop = bno; + + /* + * As blocks are freed, they are added to the per-ag busy list + * and remain there until the freeing transaction is committed to + * disk. Now that we have allocated blocks, this list must be + * searched to see if a block is being reused. If one is, then + * the freeing transaction must be pushed to disk NOW by forcing + * to disk all iclogs up to that transaction's LSN. + */ + xfs_alloc_search_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1); + return 0; +} + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer for a.g. freelist header */ + int fields) /* mask of fields to be logged (XFS_AGF_...) 
*/ +{ + int first; /* first byte offset */ + int last; /* last byte offset */ + static const short offsets[] = { + offsetof(xfs_agf_t, agf_magicnum), + offsetof(xfs_agf_t, agf_versionnum), + offsetof(xfs_agf_t, agf_seqno), + offsetof(xfs_agf_t, agf_length), + offsetof(xfs_agf_t, agf_roots[0]), + offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_flfirst), + offsetof(xfs_agf_t, agf_fllast), + offsetof(xfs_agf_t, agf_flcount), + offsetof(xfs_agf_t, agf_freeblks), + offsetof(xfs_agf_t, agf_longest), + sizeof(xfs_agf_t) + }; + + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); +} + +/* + * Interface for inode allocation to force the pag data to be initialized. + * Reads the agf through xfs_alloc_read_agf and releases the buffer + * again if one was returned. + */ +int /* error */ +xfs_alloc_pagf_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags) /* XFS_ALLOC_FLAG_... */ +{ + xfs_buf_t *bp; + int error; + + if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Put the block on the freelist for the allocation group. + * If agflbp is NULL the a.g. free block array buffer is read here. + */ +int /* error */ +xfs_alloc_put_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_buf_t *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno) /* block being freed */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. 
free block array */ + xfs_agblock_t *blockp;/* pointer to array entry */ + int error; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_put_freelist"; +#endif + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + mp = tp->t_mountp; + + if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, + INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + INT_MOD(agf->agf_fllast, ARCH_CONVERT, 1); + if (INT_GET(agf->agf_fllast, ARCH_CONVERT) == XFS_AGFL_SIZE) + INT_ZERO(agf->agf_fllast, ARCH_CONVERT); + pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)]; + INT_MOD(agf->agf_flcount, ARCH_CONVERT, 1); + xfs_trans_agflist_delta(tp, 1); + pag->pagf_flcount++; + ASSERT(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE); + blockp = &agfl->agfl_bno[INT_GET(agf->agf_fllast, ARCH_CONVERT)]; + INT_SET(*blockp, ARCH_CONVERT, bno); + TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); + xfs_trans_log_buf(tp, agflbp, + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + + sizeof(xfs_agblock_t) - 1)); + /* + * Since blocks move to the free list without the coordination + * used in xfs_bmap_finish, we can't allow block to be available + * for reallocation and non-transaction writing (user data) + * until we know that the transaction that moved it to the free + * list is permanently on disk. We track the blocks by declaring + * these blocks as "busy"; the busy list is maintained on a per-ag + * basis and each transaction records which entries should be removed + * when the iclog commits to disk. If a busy block is allocated, + * the iclog is pushed up to the LSN that freed the block. 
+ */ + xfs_alloc_mark_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1); + + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + xfs_mount_t *mp, /* mount point structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + xfs_buf_t **bpp) /* buffer for the ag freelist header */ +{ + xfs_agf_t *agf; /* ag freelist header */ + int agf_ok; /* set if agf is consistent */ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* disk block address */ + int error; + xfs_perag_t *pag; /* per allocation group data */ + + ASSERT(agno != NULLAGNUMBER); + d = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U, + &bp))) + return error; + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (!bp) { + *bpp = NULL; + return 0; + } + /* + * Validate the magic number of the agf block. 
+ */ + agf = XFS_BUF_TO_AGF(bp); + agf_ok = + INT_GET(agf->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC && + XFS_AGF_GOOD_VERSION(INT_GET(agf->agf_versionnum, ARCH_CONVERT)) && + INT_GET(agf->agf_freeblks, ARCH_CONVERT) <= + INT_GET(agf->agf_length, ARCH_CONVERT) && + INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE && + INT_GET(agf->agf_fllast, ARCH_CONVERT) < XFS_AGFL_SIZE && + INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE; + if (XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF)) { + xfs_trans_brelse(tp, bp); +#ifdef __KERNEL__ /* additional, temporary, debugging code */ + cmn_err(CE_NOTE, + "xfs_alloc_read_agf: error in <%s> AG %d", + mp->m_fsname, agno); + if (INT_GET(agf->agf_magicnum, ARCH_CONVERT) != XFS_AGF_MAGIC) + cmn_err(CE_NOTE, "bad agf_magicnum 0x%x", + INT_GET(agf->agf_magicnum, ARCH_CONVERT)); + if (!XFS_AGF_GOOD_VERSION(INT_GET(agf->agf_versionnum, ARCH_CONVERT))) + cmn_err(CE_NOTE, "Bad version number 0x%x", + INT_GET(agf->agf_versionnum, ARCH_CONVERT)); + if (!(INT_GET(agf->agf_freeblks, ARCH_CONVERT) <= + INT_GET(agf->agf_length, ARCH_CONVERT))) + cmn_err(CE_NOTE, "Bad freeblks %d %d", + INT_GET(agf->agf_freeblks, ARCH_CONVERT), + INT_GET(agf->agf_length, ARCH_CONVERT)); + if (!(INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE)) + cmn_err(CE_NOTE, "Bad flfirst %d", + INT_GET(agf->agf_flfirst, ARCH_CONVERT)); + if (!(INT_GET(agf->agf_fllast, ARCH_CONVERT) < XFS_AGFL_SIZE)) + cmn_err(CE_NOTE, "Bad fllast %d", + INT_GET(agf->agf_fllast, ARCH_CONVERT)); + if (!(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE)) + cmn_err(CE_NOTE, "Bad flcount %d", + INT_GET(agf->agf_flcount, ARCH_CONVERT)); +#endif + return XFS_ERROR(EFSCORRUPTED); + } + pag = &mp->m_perag[agno]; + if (!pag->pagf_init) { + pag->pagf_freeblks = INT_GET(agf->agf_freeblks, ARCH_CONVERT); + pag->pagf_flcount = INT_GET(agf->agf_flcount, ARCH_CONVERT); + pag->pagf_longest = INT_GET(agf->agf_longest, ARCH_CONVERT); + 
pag->pagf_levels[XFS_BTNUM_BNOi] = + INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT); + pag->pagf_levels[XFS_BTNUM_CNTi] = + INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT); + spinlock_init(&pag->pagb_lock, "xfspagb"); + pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * + sizeof(xfs_perag_busy_t), KM_SLEEP); + pag->pagf_init = 1; + } +#ifdef DEBUG + else if (!XFS_FORCED_SHUTDOWN(mp)) { + ASSERT(pag->pagf_freeblks == INT_GET(agf->agf_freeblks, ARCH_CONVERT)); + ASSERT(pag->pagf_flcount == INT_GET(agf->agf_flcount, ARCH_CONVERT)); + ASSERT(pag->pagf_longest == INT_GET(agf->agf_longest, ARCH_CONVERT)); + ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == + INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT)); + ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == + INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT)); + } +#endif + XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF); + *bpp = bp; + return 0; +} + +/* + * Allocate an extent (variable-size). + * Depending on the allocation type, we either look in a single allocation + * group or loop over the allocation groups to find the result. + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agblock_t agsize; /* allocation group size */ + int error; + int flags; /* XFS_ALLOC_FLAG_... locking flags */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_vextent"; +#endif + xfs_extlen_t minleft;/* minimum left value, temp copy */ + xfs_mount_t *mp; /* mount structure pointer */ + xfs_agnumber_t sagno; /* starting allocation group number */ + xfs_alloctype_t type; /* input allocation type */ + int bump_rotor = 0; + int no_min = 0; + + mp = args->mp; + type = args->otype = args->type; + args->agbno = NULLAGBLOCK; + /* + * Just fix this up, for the case where the last a.g. is shorter + * (or there's only one a.g.) and the caller couldn't easily figure + * that out (xfs_bmap_alloc). 
+ */ + agsize = mp->m_sb.sb_agblocks; + if (args->maxlen > agsize) + args->maxlen = agsize; + if (args->alignment == 0) + args->alignment = 1; + ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->minlen <= agsize); + ASSERT(args->mod < args->prod); + if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + args->minlen > args->maxlen || args->minlen > agsize || + args->mod >= args->prod) { + args->fsbno = NULLFSBLOCK; + TRACE_ALLOC("badargs", args); + return 0; + } + minleft = args->minleft; + + switch (type) { + case XFS_ALLOCTYPE_THIS_AG: + case XFS_ALLOCTYPE_NEAR_BNO: + case XFS_ALLOCTYPE_THIS_BNO: + /* + * These three force us into a single a.g. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + down_read(&mp->m_peraglock); + args->pag = &mp->m_perag[args->agno]; + args->minleft = 0; + error = xfs_alloc_fix_freelist(args, 0); + args->minleft = minleft; + if (error) { + TRACE_ALLOC("nofix", args); + goto error0; + } + if (!args->agbp) { + up_read(&mp->m_peraglock); + TRACE_ALLOC("noagbp", args); + break; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + up_read(&mp->m_peraglock); + break; + case XFS_ALLOCTYPE_START_BNO: + /* + * Try near allocation first, then anywhere-in-ag after + * the first a.g. fails. + */ + if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + (mp->m_flags & XFS_MOUNT_32BITINODES)) { + args->fsbno = XFS_AGB_TO_FSB(mp, mp->m_agfrotor, 0); + bump_rotor = 1; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + /* FALLTHROUGH */ + case XFS_ALLOCTYPE_ANY_AG: + case XFS_ALLOCTYPE_START_AG: + case XFS_ALLOCTYPE_FIRST_AG: + /* + * Rotate through the allocation groups looking for a winner. 
+ */ + if (type == XFS_ALLOCTYPE_ANY_AG) { + /* + * Start with the last place we left off. + */ + args->agno = sagno = mp->m_agfrotor; + args->type = XFS_ALLOCTYPE_THIS_AG; + flags = XFS_ALLOC_FLAG_TRYLOCK; + } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + /* + * Start with allocation group given by bno. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_THIS_AG; + sagno = 0; + flags = 0; + } else { + if (type == XFS_ALLOCTYPE_START_AG) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * Start with the given allocation group. + */ + args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); + flags = XFS_ALLOC_FLAG_TRYLOCK; + } + /* + * Loop over allocation groups twice; first time with + * trylock set, second time without. + */ + down_read(&mp->m_peraglock); + for (;;) { + args->pag = &mp->m_perag[args->agno]; + if (no_min) args->minleft = 0; + error = xfs_alloc_fix_freelist(args, flags); + args->minleft = minleft; + if (error) { + TRACE_ALLOC("nofix", args); + goto error0; + } + /* + * If we get a buffer back then the allocation will fly. + */ + if (args->agbp) { + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + } + TRACE_ALLOC("loopfailed", args); + /* + * Didn't work, figure out the next iteration. + */ + if (args->agno == sagno && + type == XFS_ALLOCTYPE_START_BNO) + args->type = XFS_ALLOCTYPE_THIS_AG; + if (++(args->agno) == mp->m_sb.sb_agcount) + args->agno = 0; + /* + * Reached the starting a.g., must either be done + * or switch to non-trylock mode. 
+ */ + if (args->agno == sagno) { + if (no_min == 1) { + args->agbno = NULLAGBLOCK; + TRACE_ALLOC("allfailed", args); + break; + } + if (flags == 0) { + no_min = 1; + } else { + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + } + } + } + } + up_read(&mp->m_peraglock); + if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) + mp->m_agfrotor = (args->agno + 1) % mp->m_sb.sb_agcount; + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (args->agbno == NULLAGBLOCK) + args->fsbno = NULLFSBLOCK; + else { + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); +#ifdef DEBUG + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), + args->len); +#endif + } + return 0; +error0: + up_read(&mp->m_peraglock); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ +#ifdef DEBUG + xfs_agf_t *agf; /* a.g. 
freespace header */ +#endif + xfs_alloc_arg_t args; /* allocation argument structure */ + int error; + + ASSERT(len != 0); + args.tp = tp; + args.mp = tp->t_mountp; + args.agno = XFS_FSB_TO_AGNO(args.mp, bno); + ASSERT(args.agno < args.mp->m_sb.sb_agcount); + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + args.alignment = 1; + args.minlen = args.minleft = args.minalignslop = 0; + down_read(&args.mp->m_peraglock); + args.pag = &args.mp->m_perag[args.agno]; + if ((error = xfs_alloc_fix_freelist(&args, 0))) + goto error0; +#ifdef DEBUG + ASSERT(args.agbp != NULL); + agf = XFS_BUF_TO_AGF(args.agbp); + ASSERT(args.agbno + len <= INT_GET(agf->agf_length, ARCH_CONVERT)); +#endif + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, + len, 0); +error0: + up_read(&args.mp->m_peraglock); + return error; +} + + +/* + * AG Busy list management + * The busy list contains block ranges that have been freed but whose + * transacations have not yet hit disk. If any block listed in a busy + * list is reused, the transaction that freed it must be forced to disk + * before continuing to use the block. 
+ * + * xfs_alloc_mark_busy - add to the per-ag busy list + * xfs_alloc_clear_busy - remove an item from the per-ag busy list + */ +void +xfs_alloc_mark_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + xfs_mount_t *mp; + xfs_perag_busy_t *bsy; + int n; + SPLDECL(s); + + mp = tp->t_mountp; + s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + + /* search pagb_list for an open slot */ + for (bsy = mp->m_perag[agno].pagb_list, n = 0; + n < XFS_PAGB_NUM_SLOTS; + bsy++, n++) { + if (bsy->busy_tp == NULL) { + break; + } + } + + if (n < XFS_PAGB_NUM_SLOTS) { + bsy = &mp->m_perag[agno].pagb_list[n]; + mp->m_perag[agno].pagb_count++; + TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp); + bsy->busy_start = bno; + bsy->busy_length = len; + bsy->busy_tp = tp; + xfs_trans_add_busy(tp, agno, n); + } else { + TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp); + /* + * The busy list is full! Since it is now not possible to + * track the free block, make this a synchronous transaction + * to insure that the block is not reused before this + * transaction commits. 
+ */ + xfs_trans_set_sync(tp); + } + + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); +} + +void +xfs_alloc_clear_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + int idx) +{ + xfs_mount_t *mp; + xfs_perag_busy_t *list; + SPLDECL(s); + + mp = tp->t_mountp; + + s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + list = mp->m_perag[agno].pagb_list; + + ASSERT(idx < XFS_PAGB_NUM_SLOTS); + if (list[idx].busy_tp == tp) { + TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp); + list[idx].busy_tp = NULL; + mp->m_perag[agno].pagb_count--; + } else { + TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp); + } + + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); +} + + +/* + * Search the per-ag busy list for an entry overlapping (agno,bno):len. + * Returns the index of the matching busy slot (which may be 0) after + * forcing the log through the LSN of the transaction that owns it, + * or -1 if no busy entry matches. + */ +int +xfs_alloc_search_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + xfs_mount_t *mp; + xfs_perag_busy_t *bsy; + int n; + xfs_agblock_t uend, bend; + xfs_lsn_t lsn; + int cnt; + SPLDECL(s); + + mp = tp->t_mountp; + + s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + cnt = mp->m_perag[agno].pagb_count; + + uend = bno + len; + + /* search pagb_list for this slot, skipping open slots */ + for (bsy = mp->m_perag[agno].pagb_list, n = 0; + cnt; bsy++, n++) { + + /* + * (start1,length1) within (start2, length2) + * Both uend and bend are one past the last block of their + * extent yet are compared inclusively, so an extent that + * merely abuts a busy one is treated as a match as well. + */ + if (bsy->busy_tp != NULL) { + bend = bsy->busy_start + bsy->busy_length; + if ( (bno >= bsy->busy_start && bno <= bend) || + (uend >= bsy->busy_start && uend <= bend) || + (bno <= bsy->busy_start && uend >= bsy->busy_start) ) { + TRACE_BUSYSEARCH("xfs_alloc_search_busy", + "found1", agno, bno, len, n, + tp); + break; + } + cnt--; + } + } + + /* + * If a block was found, force the log through the LSN of the + * transaction that freed the block + */ + if (cnt) { + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp); + lsn = bsy->busy_tp->t_lsn; + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + xfs_log_force(mp, lsn, 
XFS_LOG_FORCE|XFS_LOG_SYNC); + } else { + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp); + n = -1; + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + } + + return n; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_alloc.h linux-2.4-xfs/fs/xfs/xfs_alloc.h --- linux-2.4.19/fs/xfs/xfs_alloc.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_alloc.h Wed Jul 10 23:13:50 2002 @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ALLOC_H__ +#define __XFS_ALLOC_H__ + +struct xfs_buf; +struct xfs_mount; +struct xfs_perag; +struct xfs_trans; + +/* + * Freespace allocation types. Argument to xfs_alloc_[v]extent. 
+ */ +typedef enum xfs_alloctype +{ + XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */ + XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */ + XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */ + XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */ + XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */ + XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */ + XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */ +} xfs_alloctype_t; + +/* + * Flags for xfs_alloc_fix_freelist. + */ +#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ + +/* + * Argument structure for xfs_alloc routines. + * This is turned into a structure to avoid having 20 arguments passed + * down several levels of the stack. + */ +typedef struct xfs_alloc_arg { + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for a.g. freelist header */ + struct xfs_perag *pag; /* per-ag struct for this agno */ + xfs_fsblock_t fsbno; /* file system block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agblock_t agbno; /* allocation group-relative block # */ + xfs_extlen_t minlen; /* minimum size of extent */ + xfs_extlen_t maxlen; /* maximum size of extent */ + xfs_extlen_t mod; /* mod value for extent size */ + xfs_extlen_t prod; /* prod value for extent size */ + xfs_extlen_t minleft; /* min blocks must be left after us */ + xfs_extlen_t total; /* total blocks needed in xaction */ + xfs_extlen_t alignment; /* align answer to multiple of this */ + xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_extlen_t len; /* output: actual size of extent */ + xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... 
*/ + xfs_alloctype_t otype; /* original allocation type */ + char wasdel; /* set if allocation was prev delayed */ + char wasfromfl; /* set if allocation is from freelist */ + char isfl; /* set if is freelist blocks - !actg */ + char userdata; /* set if this is user data */ +} xfs_alloc_arg_t; + +/* + * Defines for userdata + */ +#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ + + +#ifdef __KERNEL__ + +/* + * Types for alloc tracing. + */ +#define XFS_ALLOC_KTRACE_ALLOC 1 +#define XFS_ALLOC_KTRACE_FREE 2 +#define XFS_ALLOC_KTRACE_MODAGF 3 +#define XFS_ALLOC_KTRACE_BUSY 4 +#define XFS_ALLOC_KTRACE_UNBUSY 5 +#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 + + +/* + * Allocation tracing buffer size. + */ +#define XFS_ALLOC_TRACE_SIZE 4096 + +#ifdef XFS_ALL_TRACE +#define XFS_ALLOC_TRACE +#endif + +#if !defined(DEBUG) +#undef XFS_ALLOC_TRACE +#endif + +/* + * Prototypes for visible xfs_alloc.c routines + */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Decide whether to use this allocation group for this allocation. + * If so, fix up the btree freelist's size. + * This is external so mkfs can call it, too. + */ +int /* error */ +xfs_alloc_fix_freelist( + xfs_alloc_arg_t *args, /* allocation argument structure */ + int flags); /* XFS_ALLOC_FLAG_... */ + +/* + * Get a block from the freelist. + * The block number is returned through bnop. + */ +int /* error */ +xfs_alloc_get_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop); /* block address retrieved from freelist */ + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* buffer for a.g. 
freelist header */ + int fields);/* mask of fields to be logged (XFS_AGF_...) */ + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags); /* XFS_ALLOC_FLAG_... */ + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for a.g. freelist header */ + struct xfs_buf *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno); /* block being freed */ + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp); /* buffer for the ag freelist header */ + +/* + * Allocate an extent (variable-size). + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args); /* allocation argument structure */ + +/* + * Free an extent. 
+ */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len); /* length of extent */ + +void +xfs_alloc_mark_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len); + +void +xfs_alloc_clear_busy(xfs_trans_t *tp, + xfs_agnumber_t ag, + int idx); + + +#endif /* __KERNEL__ */ + +#endif /* __XFS_ALLOC_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_alloc_btree.c linux-2.4-xfs/fs/xfs/xfs_alloc_btree.c --- linux-2.4.19/fs/xfs/xfs_alloc_btree.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_alloc_btree.c Wed Jul 10 23:13:50 2002 @@ -0,0 +1,2155 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Free space allocation for XFS. + */ + +#include + +/* + * Prototypes for internal functions. + */ + +STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); +STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); +STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, + xfs_alloc_key_t *, xfs_btree_cur_t **, int *); +STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int); + +/* + * Internal functions. + */ + +/* + * Single level of the xfs_alloc_delete record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. 
+ */ +STATIC int /* error */ +xfs_alloc_delrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* out: 0 fail, 1 done, 2 go on to next level */ +{ + xfs_agf_t *agf; /* allocation group freelist header */ + xfs_alloc_block_t *block; /* btree block record/key lives in */ + xfs_agblock_t bno; /* btree block number */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop index */ + xfs_alloc_key_t key; /* lkp points here if block is level 0 */ + xfs_agblock_t lbno; /* left block's block number */ + xfs_buf_t *lbp; /* left block's buffer pointer */ + xfs_alloc_block_t *left; /* left btree block */ + xfs_alloc_key_t *lkp=NULL; /* left block key pointer */ + xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */ + int lrecs=0; /* number of records in left block */ + xfs_alloc_rec_t *lrp; /* left block record pointer */ + xfs_mount_t *mp; /* mount structure */ + int ptr; /* index in btree block for this rec */ + xfs_agblock_t rbno; /* right block's block number */ + xfs_buf_t *rbp; /* right block's buffer pointer */ + xfs_alloc_block_t *right; /* right btree block */ + xfs_alloc_key_t *rkp; /* right block key pointer */ + xfs_alloc_ptr_t *rpp; /* right block address pointer */ + int rrecs=0; /* number of records in right block */ + xfs_alloc_rec_t *rrp; /* right block record pointer */ + xfs_btree_cur_t *tcur; /* temporary btree cursor */ + + /* + * Get the index of the entry being deleted, check for nothing there. + */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + *stat = 0; + return 0; + } + /* + * Get the buffer & block containing the record or key/ptr. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + /* + * Fail if we're off the end of the block.
+ */ + if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + *stat = 0; + return 0; + } + XFS_STATS_INC(xfsstats.xs_abt_delrec); + /* + * It's a nonleaf. Excise the key and ptr being deleted, by + * sliding the entries past them down one. + * Log the changed areas of the block. + */ + if (level > 0) { + lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur); + lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur); +#ifdef DEBUG + for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + ovbcopy(&lkp[ptr], &lkp[ptr - 1], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lkp)); /* INT_: mem copy */ + ovbcopy(&lpp[ptr], &lpp[ptr - 1], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lpp)); /* INT_: mem copy */ + xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + } + } + /* + * It's a leaf. Excise the record being deleted, by sliding the + * entries past it down one. Log the changed areas of the block. + */ + else { + lrp = XFS_ALLOC_REC_ADDR(block, 1, cur); + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + ovbcopy(&lrp[ptr], &lrp[ptr - 1], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lrp)); + xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + } + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + key.ar_startblock = lrp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = lrp->ar_blockcount; /* INT_: direct copy */ + lkp = &key; + } + } + /* + * Decrement and log the number of entries in the block.
+ */ + INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1); + xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); + /* + * See if the longest free extent in the allocation group was + * changed by this operation. True if it's the by-size btree, and + * this is the leaf level, and there is no right sibling block, + * and this was the last record. + */ + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + mp = cur->bc_mp; + + if (level == 0 && + cur->bc_btnum == XFS_BTNUM_CNT && + INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK && + ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + ASSERT(ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT) + 1); + /* + * There are still records in the block. Grab the size + * from the last one. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + rrp = XFS_ALLOC_REC_ADDR(block, INT_GET(block->bb_numrecs, ARCH_CONVERT), cur); + INT_COPY(agf->agf_longest, rrp->ar_blockcount, ARCH_CONVERT); + } + /* + * No free extents left. + */ + else + INT_ZERO(agf->agf_longest, ARCH_CONVERT); + mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest = + INT_GET(agf->agf_longest, ARCH_CONVERT); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_LONGEST); + } + /* + * Is this the root level? If so, we're almost done. + */ + if (level == cur->bc_nlevels - 1) { + /* + * If this is the root level, + * and there's only one entry left, + * and it's NOT the leaf level, + * then we can get rid of this level. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) { + /* + * lpp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + bno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT); + INT_COPY(agf->agf_roots[cur->bc_btnum], *lpp, ARCH_CONVERT); + INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, -1); + mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_levels[cur->bc_btnum]--; + /* + * Put this buffer/block on the ag's freelist.
+ */ + if ((error = xfs_alloc_put_freelist(cur->bc_tp, + cur->bc_private.a.agbp, NULL, bno))) + return error; + xfs_trans_agbtree_delta(cur->bc_tp, -1); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_ROOTS | XFS_AGF_LEVELS); + /* + * Update the cursor so there's one fewer level. + */ + xfs_btree_setbuf(cur, level, 0); + cur->bc_nlevels--; + } else if (level > 0 && + (error = xfs_alloc_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1))) + return error; + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. + */ + rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT); + bno = NULLAGBLOCK; + ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK); + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + if ((error = xfs_btree_dup_cursor(cur, &tcur))) + return error; + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (rbno != NULLAGBLOCK) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice.
+ */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_increment(tcur, level, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Grab a pointer to the block. + */ + rbp = tcur->bc_bufs[level]; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + goto error0; +#endif + /* + * Grab the current block number, for future use. + */ + bno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if ((error = xfs_alloc_lshift(tcur, level, &i))) + goto error0; + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)); + xfs_btree_del_cursor(tcur, + XFS_BTREE_NOERROR); + if (level > 0 && + (error = xfs_alloc_decrement(cur, level, + &i))) + return error; + *stat = 1; + return 0; + } + } + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if (lbno != NULLAGBLOCK) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_decrement(tcur, level, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (lbno != NULLAGBLOCK) { + /* + * Move the temp cursor to the first entry in the + * previous block.
+ */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_decrement(tcur, level, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_btree_firstrec(tcur, level); + /* + * Grab a pointer to the block. + */ + lbp = tcur->bc_bufs[level]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + goto error0; +#endif + /* + * Grab the current block number, for future use. + */ + bno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if ((error = xfs_alloc_rshift(tcur, level, &i))) + goto error0; + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)); + xfs_btree_del_cursor(tcur, + XFS_BTREE_NOERROR); + if (level == 0) + cur->bc_ptrs[0]++; + *stat = 1; + return 0; + } + } + /* + * Otherwise, grab the number of records in left for + * future reference. + */ + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * Delete the temp cursor, we're done with it. + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + /* + * If here, we need to do a join to keep the tree balanced. + */ + ASSERT(bno != NULLAGBLOCK); + /* + * See if we can join with the left neighbor block. + */ + if (lbno != NULLAGBLOCK && + lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor.
+ */ + rbno = bno; + right = block; + rbp = bp; + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, lbno, 0, &lbp, + XFS_ALLOC_BTREE_REF))) + return error; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; + } + /* + * If that won't work, see if we can join with the right neighbor block. + */ + else if (rbno != NULLAGBLOCK && + rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= + XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lbno = bno; + left = block; + lbp = bp; + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, rbno, 0, &rbp, + XFS_ALLOC_BTREE_REF))) + return error; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; + } + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + else { + if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + if (level > 0) { + /* + * It's a non-leaf. Move keys and pointers.
+ */ + lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + bcopy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp)); /* INT_: structure copy */ + bcopy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp)); /* INT_: structure copy */ + xfs_alloc_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, + INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, + INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + } else { + /* + * It's a leaf. Move records. + */ + lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + bcopy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp)); + xfs_alloc_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, + INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + } + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + xfs_btree_setbuf(cur, level, lbp); + cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if (level + 1 < cur->bc_nlevels && + (error = xfs_alloc_increment(cur, level + 1, &i))) + return error; + /* + * Fix up the number of records in the surviving block.
+ */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + /* + * Fix up the right block pointer in the surviving block, and log it. + */ + left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */ + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + /* + * If there is a right sibling now, make it point to the + * remaining block. + */ + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_alloc_block_t *rrblock; + xfs_buf_t *rrbp; + + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, + &rrbp, XFS_ALLOC_BTREE_REF))) + return error; + rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); + if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) + return error; + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno); + xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); + } + /* + * Free the block being deleted by putting it on the freelist. + */ + if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp, + NULL, rbno))) + return error; + xfs_trans_agbtree_delta(cur->bc_tp, -1); + /* + * Adjust the current level's cursor so that we're left referring + * to the right node, after we're done. + * If this leaves the ptr value 0 our caller will fix it up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + /* + * Return value means the next level up has something to do. + */ + *stat = 2; + return 0; + +error0: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary.
+ */ +STATIC int /* error */ +xfs_alloc_insrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to insert record at */ + xfs_agblock_t *bnop, /* i/o: block number inserted */ + xfs_alloc_rec_t *recp, /* i/o: record data inserted */ + xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ + int *stat) /* output: success/failure */ +{ + xfs_agf_t *agf; /* allocation group freelist header */ + xfs_alloc_block_t *block; /* btree block record/key lives in */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop index */ + xfs_alloc_key_t key; /* key value being inserted */ + xfs_alloc_key_t *kp; /* pointer to btree keys */ + xfs_agblock_t nbno; /* block number of allocated block */ + xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ + xfs_alloc_key_t nkey; /* new key value, from split */ + xfs_alloc_rec_t nrec; /* new record value, for caller */ + int optr; /* old ptr value */ + xfs_alloc_ptr_t *pp; /* pointer to btree addresses */ + int ptr; /* index in btree block for this rec */ + xfs_alloc_rec_t *rp; /* pointer to btree records */ + + ASSERT(INT_GET(recp->ar_blockcount, ARCH_CONVERT) > 0); + /* + * If we made it past the root level (level >= bc_nlevels), allocate + * a new root block and we're done. + */ + if (level >= cur->bc_nlevels) { + XFS_STATS_INC(xfsstats.xs_abt_insrec); + if ((error = xfs_alloc_newroot(cur, &i))) + return error; + *bnop = NULLAGBLOCK; + *stat = i; + return 0; + } + /* + * Make a key out of the record data to be inserted, and save it. + */ + key.ar_startblock = recp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = recp->ar_blockcount; /* INT_: direct copy */ + optr = ptr = cur->bc_ptrs[level]; + /* + * If we're off the left edge, return failure. + */ + if (ptr == 0) { + *stat = 0; + return 0; + } + XFS_STATS_INC(xfsstats.xs_abt_insrec); + /* + * Get pointers to the btree buffer and block.
+ */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; + /* + * Check that the new entry is being inserted in the right place. + */ + if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + if (level == 0) { + rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); + xfs_btree_check_rec(cur->bc_btnum, recp, rp); + } else { + kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); + xfs_btree_check_key(cur->bc_btnum, &key, kp); + } + } +#endif + nbno = NULLAGBLOCK; + ncur = (xfs_btree_cur_t *)0; + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + /* + * First, try shifting an entry to the right neighbor. + */ + if ((error = xfs_alloc_rshift(cur, level, &i))) + return error; + if (i) { + /* nothing */ + } + /* + * Next, try shifting an entry to the left neighbor. + */ + else { + if ((error = xfs_alloc_lshift(cur, level, &i))) + return error; + if (i) + optr = ptr = cur->bc_ptrs[level]; + else { + /* + * Next, try splitting the current block in + * half. If this works we have to re-set our + * variables because we could be in a + * different block now. + */ + if ((error = xfs_alloc_split(cur, level, &nbno, + &nkey, &ncur, &i))) + return error; + if (i) { + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = + xfs_btree_check_sblock(cur, + block, level, bp))) + return error; +#endif + ptr = cur->bc_ptrs[level]; + nrec.ar_startblock = nkey.ar_startblock; /* INT_: direct copy */ + nrec.ar_blockcount = nkey.ar_blockcount; /* INT_: direct copy */ + } + /* + * Otherwise the insert fails. + */ + else { + *stat = 0; + return 0; + } + } + } + } + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + if (level > 0) { + /* + * It's a non-leaf entry.
Make a hole for the new data + * in the key and ptr regions of the block. + */ + kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); + pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); +#ifdef DEBUG + for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level))) + return error; + } +#endif + ovbcopy(&kp[ptr - 1], &kp[ptr], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp)); /* INT_: copy */ + ovbcopy(&pp[ptr - 1], &pp[ptr], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp)); /* INT_: copy */ +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, *bnop, level))) + return error; +#endif + /* + * Now stuff the new data in, bump numrecs and log the new data. + */ + kp[ptr - 1] = key; + INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); + INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); +#ifdef DEBUG + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) + xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, + kp + ptr); +#endif + } else { + /* + * It's a leaf entry. Make a hole for the new record. + */ + rp = XFS_ALLOC_REC_ADDR(block, 1, cur); + ovbcopy(&rp[ptr - 1], &rp[ptr], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp)); + /* + * Now stuff the new record in, bump numrecs + * and log the new data. + */ + rp[ptr - 1] = *recp; /* INT_: struct copy */ + INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); +#ifdef DEBUG + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) + xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, + rp + ptr); +#endif + } + /* + * Log the new number of records in the btree header.
+ */ + xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); + /* + * If we inserted at the start of a block, update the parents' keys. + */ + if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1))) + return error; + /* + * Look to see if the longest extent in the allocation group + * needs to be updated. + */ + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + if (level == 0 && + cur->bc_btnum == XFS_BTNUM_CNT && + INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK && + INT_GET(recp->ar_blockcount, ARCH_CONVERT) > INT_GET(agf->agf_longest, ARCH_CONVERT)) { + /* + * If this is a leaf in the by-size btree and there + * is no right sibling block and this block is bigger + * than the previous longest block, update it. + */ + INT_COPY(agf->agf_longest, recp->ar_blockcount, ARCH_CONVERT); + cur->bc_mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest + = INT_GET(recp->ar_blockcount, ARCH_CONVERT); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_LONGEST); + } + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *bnop = nbno; + if (nbno != NULLAGBLOCK) { + *recp = nrec; /* INT_: struct copy */ + *curp = ncur; /* INT_: pointer copy */ + } + *stat = 1; + return 0; +} + +/* + * Log header fields from a btree block. + */ +STATIC void +xfs_alloc_log_block( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_...
*/ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short offsets[] = { /* table of offsets */ + offsetof(xfs_alloc_block_t, bb_magic), + offsetof(xfs_alloc_block_t, bb_level), + offsetof(xfs_alloc_block_t, bb_numrecs), + offsetof(xfs_alloc_block_t, bb_leftsib), + offsetof(xfs_alloc_block_t, bb_rightsib), + sizeof(xfs_alloc_block_t) + }; + + xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, first, last); +} + +/* + * Log keys from a btree block (nonleaf). + * kfirst and klast are 1-based and inclusive; the byte range logged runs + * from the start of key kfirst through the last byte of key klast. + */ +STATIC void +xfs_alloc_log_keys( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int kfirst, /* 1-based index of first key to log */ + int klast) /* 1-based index of last key to log */ +{ + xfs_alloc_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + xfs_alloc_key_t *kp; /* key pointer in btree block */ + int last; /* last byte offset logged */ + + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Log block pointer fields from a btree block (nonleaf).
+ */ +STATIC void +xfs_alloc_log_ptrs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int pfirst, /* 1-based index of first pointer to log */ + int plast) /* 1-based index of last pointer to log */ +{ + xfs_alloc_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */ + + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Log records from a btree block (leaf). + * rfirst and rlast are 1-based and inclusive. + */ +STATIC void +xfs_alloc_log_recs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int rfirst, /* 1-based index of first record to log */ + int rlast) /* 1-based index of last record to log */ +{ + xfs_alloc_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + xfs_alloc_rec_t *rp; /* record pointer for btree block */ + + + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + rp = XFS_ALLOC_REC_ADDR(block, 1, cur); +#ifdef DEBUG + { /* assert that no record being logged extends past agf_length */ + xfs_agf_t *agf; + xfs_alloc_rec_t *p; + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++) + ASSERT(INT_GET(p->ar_startblock, ARCH_CONVERT) + INT_GET(p->ar_blockcount, ARCH_CONVERT) <= + INT_GET(agf->agf_length, ARCH_CONVERT)); + } +#endif + first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * The return value is an error code; *stat is set to 0 if no such + * record can be found, 1 for success.
+ */ +STATIC int /* error */ +xfs_alloc_lookup( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + xfs_agblock_t agbno; /* a.g. relative btree block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_alloc_block_t *block=NULL; /* current btree block */ + int diff; /* difference for the current key */ + int error; /* error return value */ + int keyno=0; /* current key number */ + int level; /* level in the btree */ + xfs_mount_t *mp; /* file system mount point */ + + XFS_STATS_INC(xfsstats.xs_abt_lookup); + /* + * Get the allocation group header, and the root block number. + */ + mp = cur->bc_mp; + + { + xfs_agf_t *agf; /* a.g. freespace header */ + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + agno = INT_GET(agf->agf_seqno, ARCH_CONVERT); + agbno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT); + } + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + xfs_buf_t *bp; /* buffer pointer for btree block */ + xfs_daddr_t d; /* disk address of btree block */ + + /* + * Get the disk address we're looking for. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + /* + * If the old buffer at this level is for a different block, + * throw it away, otherwise just use it. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) != d) + bp = (xfs_buf_t *)0; + if (!bp) { + /* + * Need to get a new buffer. Read it, then + * set it in the cursor, releasing the old one. 
+ */ + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno, + agbno, 0, &bp, XFS_ALLOC_BTREE_REF))) + return error; + xfs_btree_setbuf(cur, level, bp); + /* + * Point to the btree block, now that we have the buffer + */ + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, level, + bp))) + return error; + } else + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + /* + * If we already had a key match at a higher level, we know + * we need to use the first entry in this block. + */ + if (diff == 0) + keyno = 1; + /* + * Otherwise we need to search this block. Do a binary search. + */ + else { + int high; /* high entry number */ + xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */ + xfs_alloc_rec_t *krbase=NULL;/* base of records in block */ + int low; /* low entry number */ + + /* + * Get a pointer to keys or records. + */ + if (level > 0) + kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur); + else + krbase = XFS_ALLOC_REC_ADDR(block, 1, cur); + /* + * Set low and high entry numbers, 1-based. + */ + low = 1; + if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) { + /* + * If the block is empty, the tree must + * be an empty leaf. + */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + *stat = 0; + return 0; + } + /* + * Binary search the block. + */ + while (low <= high) { + xfs_extlen_t blockcount; /* key value */ + xfs_agblock_t startblock; /* key value */ + + XFS_STATS_INC(xfsstats.xs_abt_compare); + /* + * keyno is average of low and high. + */ + keyno = (low + high) >> 1; + /* + * Get startblock & blockcount. 
+ */ + if (level > 0) { + xfs_alloc_key_t *kkp; + + kkp = kkbase + keyno - 1; + startblock = INT_GET(kkp->ar_startblock, ARCH_CONVERT); + blockcount = INT_GET(kkp->ar_blockcount, ARCH_CONVERT); + } else { + xfs_alloc_rec_t *krp; + + krp = krbase + keyno - 1; + startblock = INT_GET(krp->ar_startblock, ARCH_CONVERT); + blockcount = INT_GET(krp->ar_blockcount, ARCH_CONVERT); + } + /* + * Compute difference to get next direction. + */ + if (cur->bc_btnum == XFS_BTNUM_BNO) + diff = (int)startblock - + (int)cur->bc_rec.a.ar_startblock; + else if (!(diff = (int)blockcount - + (int)cur->bc_rec.a.ar_blockcount)) + diff = (int)startblock - + (int)cur->bc_rec.a.ar_startblock; + /* + * Less than, move right. + */ + if (diff < 0) + low = keyno + 1; + /* + * Greater than, move left. + */ + else if (diff > 0) + high = keyno - 1; + /* + * Equal, we're done. + */ + else + break; + } + } + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, keyno, cur), ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, agbno, level))) + return error; +#endif + cur->bc_ptrs[level] = keyno; + } + } + /* + * Done with the search. + * See if we need to adjust the results. + */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. 
+ */ + if (dir == XFS_LOOKUP_GE && + keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) && + INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + int i; + + cur->bc_ptrs[0] = keyno; + if ((error = xfs_alloc_increment(cur, 0, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + *stat = 1; + return 0; + } + } + else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + /* + * Return if we succeeded or not. + */ + if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) + *stat = 0; + else + *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); + return 0; +} + +/* + * Move 1 entry (a record at level 0, a key/ptr pair above it) left from cur/level if possible. + * Update cur to reflect the new path; *stat is 1 if an entry was moved, else 0. + */ +STATIC int /* error */ +xfs_alloc_lshift( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to shift record on */ + int *stat) /* output: 1 if shifted, 0 if not */ +{ + int error; /* error return value */ +#ifdef DEBUG + int i; /* loop index */ +#endif + xfs_alloc_key_t key; /* key value for leaf level upward */ + xfs_buf_t *lbp; /* buffer for left neighbor block */ + xfs_alloc_block_t *left; /* left neighbor btree block */ + int nrec; /* new number of left block entries */ + xfs_buf_t *rbp; /* buffer for right (current) block */ + xfs_alloc_block_t *right; /* right (current) btree block */ + xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */ + xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */ + xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */ + + /* + * Set up variables for this block as "right". + */ + rbp = cur->bc_bufs[level]; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; +#endif + /* + * If we've got no left sibling then we can't shift an entry left. + */ + if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * If the cursor entry is the one that would be moved (right's first), don't + * do it...
it's too complicated. + */ + if (cur->bc_ptrs[level] <= 1) { + *stat = 0; + return 0; + } + /* + * Set up the left neighbor as "left". + */ + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp, + XFS_ALLOC_BTREE_REF))) + return error; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; + /* + * If it's full, it can't take another entry. + */ + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + *stat = 0; + return 0; + } + nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1; + /* + * If non-leaf, copy right's first key and ptr to the end (slot nrec) of the left block. + */ + if (level > 0) { + xfs_alloc_key_t *lkp; /* key pointer for left block */ + xfs_alloc_ptr_t *lpp; /* address pointer for left block */ + + lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + *lkp = *rkp; + xfs_alloc_log_keys(cur, lbp, nrec, nrec); + lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) + return error; +#endif + *lpp = *rpp; /* INT_: copy */ + xfs_alloc_log_ptrs(cur, lbp, nrec, nrec); + xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); + } + /* + * If leaf, copy right's first record to the end (slot nrec) of the left block. + */ + else { + xfs_alloc_rec_t *lrp; /* record pointer for left block */ + + lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + *lrp = *rrp; + xfs_alloc_log_recs(cur, lbp, nrec, nrec); + xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); + } + /* + * Bump and log left's numrecs, decrement and log right's numrecs.
+ */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); + INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1); + xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); + /* + * Slide the contents of right down one entry, over the entry just copied to left. + */ + if (level > 0) { +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), + level))) + return error; + } +#endif + ovbcopy(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + ovbcopy(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); + xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + } else { + ovbcopy(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */ + rkp = &key; + } + /* + * Update the parent key values of right; rkp now refers to right's new first key. + */ + if ((error = xfs_alloc_updkey(cur, rkp, level + 1))) + return error; + /* + * Slide the cursor value left one, matching the slide of right's entries. + */ + cur->bc_ptrs[level]--; + *stat = 1; + return 0; +} + +/* + * Allocate a new root block, fill it in.
+ */ +STATIC int /* error */ +xfs_alloc_newroot( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* output: 1 if root added, 0 if no free block */ +{ + int error; /* error return value */ + xfs_agblock_t lbno; /* left block number */ + xfs_buf_t *lbp; /* left btree buffer */ + xfs_alloc_block_t *left; /* left btree block */ + xfs_mount_t *mp; /* mount structure */ + xfs_agblock_t nbno; /* new block number */ + xfs_buf_t *nbp; /* new (root) buffer */ + xfs_alloc_block_t *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + xfs_agblock_t rbno; /* right block number */ + xfs_buf_t *rbp; /* right btree buffer */ + xfs_alloc_block_t *right; /* right btree block */ + + mp = cur->bc_mp; + + ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp)); + /* + * Get a buffer from the freelist blocks, for the new root. + */ + if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &nbno))) + return error; + /* + * None available, we fail (no error is returned, *stat is 0). + */ + if (nbno == NULLAGBLOCK) { + *stat = 0; + return 0; + } + xfs_trans_agbtree_delta(cur->bc_tp, 1); + nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno, + 0); + new = XFS_BUF_TO_ALLOC_BLOCK(nbp); + /* + * Set the new root and bump the level count in the a.g. freespace structure; the in-core per-a.g. level count is bumped too. + */ + { + xfs_agf_t *agf; /* a.g. freespace header */ + xfs_agnumber_t seqno; /* agf_seqno, used to index mp->m_perag */ + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + INT_SET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT, nbno); + INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, 1); + seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT); + mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++; + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_ROOTS | XFS_AGF_LEVELS); + } + /* + * At the previous root level there are now two blocks: the old + * root, and the new block generated when it was split. + * We don't know which one the cursor is pointing at, so we + * set up variables "left" and "right" for each case.
+ */ + lbp = cur->bc_bufs[cur->bc_nlevels - 1]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp))) + return error; +#endif + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + /* + * Our block has a right sibling, so it is left; pick up the right block. + */ + lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp)); + rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, rbno, 0, &rbp, + XFS_ALLOC_BTREE_REF))) + return error; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, + cur->bc_nlevels - 1, rbp))) + return error; + nptr = 1; + } else { + /* + * No right sibling, so our block is right; pick up the left block. + */ + rbp = lbp; + right = left; + rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp)); + lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, lbno, 0, &lbp, + XFS_ALLOC_BTREE_REF))) + return error; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, + cur->bc_nlevels - 1, lbp))) + return error; + nptr = 2; + } + /* + * Fill in the new block's btree header and log it; the new root has 2 entries and no siblings. + */ + INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]); + INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels); + INT_SET(new->bb_numrecs, ARCH_CONVERT, 2); + INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK); + INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK); + xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS); + ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK); + /* + * Fill in the key data in the new root.
+ */ + { + xfs_alloc_key_t *kp; /* btree key pointer */ + + kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); + if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) { + kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */ + kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */ + } else { + xfs_alloc_rec_t *rp; /* btree record pointer */ + + rp = XFS_ALLOC_REC_ADDR(left, 1, cur); + kp[0].ar_startblock = rp->ar_startblock; /* INT_: direct copy */ + kp[0].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */ + rp = XFS_ALLOC_REC_ADDR(right, 1, cur); + kp[1].ar_startblock = rp->ar_startblock; /* INT_: direct copy */ + kp[1].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */ + } + } + xfs_alloc_log_keys(cur, nbp, 1, 2); + /* + * Fill in the pointer data in the new root: entry 1 is the left block, entry 2 the right. + */ + { + xfs_alloc_ptr_t *pp; /* btree address pointer */ + + pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); + INT_SET(pp[0], ARCH_CONVERT, lbno); + INT_SET(pp[1], ARCH_CONVERT, rbno); + } + xfs_alloc_log_ptrs(cur, nbp, 1, 2); + /* + * Fix up the cursor: the new root becomes the top level, pointing at entry nptr. + */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + *stat = 1; + return 0; +} + +/* + * Move 1 entry (a record at level 0, a key/ptr pair above it) right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_alloc_rshift( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to shift record on */ + int *stat) /* output: 1 if shifted, 0 if not */ +{ + int error; /* error return value */ + int i; /* loop index, result code */ + xfs_alloc_key_t key; /* key value for leaf level upward */ + xfs_buf_t *lbp; /* buffer for left (current) block */ + xfs_alloc_block_t *left; /* left (current) btree block */ + xfs_buf_t *rbp; /* buffer for right neighbor block */ + xfs_alloc_block_t *right; /* right neighbor btree block */ + xfs_alloc_key_t *rkp; /* key pointer for right block */ + xfs_btree_cur_t *tcur; /* temporary cursor */ + + /* + * Set up variables for this block as "left".
+ */ + lbp = cur->bc_bufs[level]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; +#endif + /* + * If we've got no right sibling then we can't shift an entry right. + */ + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * If the cursor entry is the one that would be moved (left's last), don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) { + *stat = 0; + return 0; + } + /* + * Set up the right neighbor as "right". + */ + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp, + XFS_ALLOC_BTREE_REF))) + return error; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; + /* + * If it's full, it can't take another entry. + */ + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + *stat = 0; + return 0; + } + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole.
+ */ + if (level > 0) { + xfs_alloc_key_t *lkp; /* key pointer for left block (last key) */ + xfs_alloc_ptr_t *lpp; /* address pointer for left block (last ptr) */ + xfs_alloc_ptr_t *rpp; /* address pointer for right block */ + + lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) + return error; +#endif + *rkp = *lkp; /* INT_: copy */ + *rpp = *lpp; /* INT_: copy */ + xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); + } else { + xfs_alloc_rec_t *lrp; /* record pointer for left block (last record) */ + xfs_alloc_rec_t *rrp; /* record pointer for right block */ + + lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + *rrp = *lrp; + xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */ + rkp = &key; /* right's new first key, passed to xfs_alloc_updkey below */ + xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1); + } + /* + * Decrement and log left's numrecs, bump and log right's numrecs.
+ */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1); + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); + /* + * Using a temporary cursor, update the parent key values of the + * block on the right; tcur is stepped there via xfs_btree_lastrec and xfs_alloc_increment, and is deleted on every path. + */ + if ((error = xfs_btree_dup_cursor(cur, &tcur))) + return error; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_increment(tcur, level, &i)) || + (error = xfs_alloc_updkey(tcur, rkp, level + 1))) + goto error0; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + *stat = 1; + return 0; +error0: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and its first record (to be inserted into parent). + */ +STATIC int /* error */ +xfs_alloc_split( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to split */ + xfs_agblock_t *bnop, /* output: block number allocated */ + xfs_alloc_key_t *keyp, /* output: first key of new block */ + xfs_btree_cur_t **curp, /* output: new cursor, set only if level + 1 < bc_nlevels */ + int *stat) /* output: 1 if split, 0 if no free block */ +{ + int error; /* error return value */ + int i; /* loop index/record number */ + xfs_agblock_t lbno; /* left (current) block number */ + xfs_buf_t *lbp; /* buffer for left block */ + xfs_alloc_block_t *left; /* left (current) btree block */ + xfs_agblock_t rbno; /* right (new) block number */ + xfs_buf_t *rbp; /* buffer for right block */ + xfs_alloc_block_t *right; /* right (new) btree block */ + + /* + * Allocate the new block from the freelist. + * If we can't do it, we're toast. Give up.
+ */ + if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &rbno))) + return error; + if (rbno == NULLAGBLOCK) { + *stat = 0; + return 0; + } + xfs_trans_agbtree_delta(cur->bc_tp, 1); + rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno, + rbno, 0); + /* + * Set up the new block as "right". + */ + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + /* + * "Left" is the current (according to the cursor) block. + */ + lbp = cur->bc_bufs[level]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; +#endif + /* + * Fill in the btree header for the new block; it starts with half of left's entries (rounded down). + */ + INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]); + right->bb_level = left->bb_level; /* INT_: direct copy */ + INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2)); + /* + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries (presumably once the caller's pending insert at the cursor is done). + */ + if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) && + cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1) + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); + i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1; /* index in left of the first entry to move */ + /* + * For non-leaf blocks, copy keys and addresses over to the new block.
+ */ + if (level > 0) { + xfs_alloc_key_t *lkp; /* left btree key pointer */ + xfs_alloc_ptr_t *lpp; /* left btree address pointer */ + xfs_alloc_key_t *rkp; /* right btree key pointer */ + xfs_alloc_ptr_t *rpp; /* right btree address pointer */ + + lkp = XFS_ALLOC_KEY_ADDR(left, i, cur); + lpp = XFS_ALLOC_PTR_ADDR(left, i, cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); /* INT_: copy */ + bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));/* INT_: copy */ + xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + *keyp = *rkp; /* first key of the new block */ + } + /* + * For leaf blocks, copy records over to the new block and build the output key from its first record. + */ + else { + xfs_alloc_rec_t *lrp; /* left btree record pointer */ + xfs_alloc_rec_t *rrp; /* right btree record pointer */ + + lrp = XFS_ALLOC_REC_ADDR(left, i, cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + keyp->ar_startblock = rrp->ar_startblock; /* INT_: direct copy */ + keyp->ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */ + } + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers.
+ */ + lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp)); + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT))); + right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */ + INT_SET(left->bb_rightsib, ARCH_CONVERT, rbno); + INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno); + xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS); + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_alloc_block_t *rrblock; /* rr btree block (right sibling of right) */ + xfs_buf_t *rrbp; /* buffer for rrblock */ + + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, + &rrbp, XFS_ALLOC_BTREE_REF))) + return error; + rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); + if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) + return error; + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, rbno); + xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * If there are more levels, we'll need another cursor which refers to + * the right block, no matter where this cursor was; the copy's ptr at the parent level is bumped by one. + */ + if (level + 1 < cur->bc_nlevels) { + if ((error = xfs_btree_dup_cursor(cur, curp))) + return error; + (*curp)->bc_ptrs[level + 1]++; + } + *bnop = rbno; + *stat = 1; + return 0; +} + +/* + * Update keys at all levels from here to the root along the cursor's path.
+ */ +STATIC int /* error */ +xfs_alloc_updkey( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_alloc_key_t *keyp, /* new key value to update to */ + int level) /* starting level for update */ +{ + int ptr; /* index of key in block */ + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block; the key at that level is still updated before stopping. + */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { + xfs_alloc_block_t *block; /* btree block */ + xfs_buf_t *bp; /* buffer for block */ +#ifdef DEBUG + int error; /* error return value */ +#endif + xfs_alloc_key_t *kp; /* ptr to btree block keys */ + + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + ptr = cur->bc_ptrs[level]; + kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); + *kp = *keyp; + xfs_alloc_log_keys(cur, bp, ptr, ptr); + } + return 0; +} + +/* + * Externally visible routines. + */ + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_alloc_decrement( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat) /* output: 1 if moved, 0 if off the left edge */ +{ + xfs_alloc_block_t *block; /* btree block */ + int error; /* error return value */ + int lev; /* btree level */ + + ASSERT(level < cur->bc_nlevels); + /* + * Read-ahead to the left at this level. + */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + /* + * Decrement the ptr at this level. If we're still in the block + * then we're done. + */ + if (--cur->bc_ptrs[level] > 0) { + *stat = 1; + return 0; + } + /* + * Get a pointer to the btree block.
+ */ + block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, + cur->bc_bufs[level]))) + return error; +#endif + /* + * If we just went off the left edge of the tree, return failure (the ptr at this level stays decremented). + */ + if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* + * Read-ahead the left block, we're going to read it + * in the next loop. + */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + /* + * If we went off the root then we are seriously confused. + */ + ASSERT(lev < cur->bc_nlevels); + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers; each lower level is left on the last entry of its new block. + */ + for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) { + xfs_agblock_t agbno; /* block number of btree block */ + xfs_buf_t *bp; /* buffer pointer for block */ + + agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, agbno, 0, &bp, + XFS_ALLOC_BTREE_REF))) + return error; + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; + cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT); + } + *stat = 1; + return 0; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_alloc_delete( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int i; /* result code */ + int level; /* btree level */ + + /* + * Go up the tree, starting at leaf level.
+ * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done; i holds the result of the last xfs_alloc_delrec call. + */ + for (level = 0, i = 2; i == 2; level++) { + if ((error = xfs_alloc_delrec(cur, level, &i))) + return error; + } + if (i == 0) { /* decrement at the first upper level whose ptr is 0 */ + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + if ((error = xfs_alloc_decrement(cur, level, &i))) + return error; + break; + } + } + } + *stat = i; + return 0; +} + +/* + * Get the data from the record the cursor points to at the leaf level (level 0). + */ +int /* error */ +xfs_alloc_get_rec( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat) /* output: success/failure */ +{ + xfs_alloc_block_t *block; /* btree block */ +#ifdef DEBUG + int error; /* error return value */ +#endif + int ptr; /* record number */ + + ptr = cur->bc_ptrs[0]; + block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) + return error; +#endif + /* + * Off the right end or left end, return failure (the bno and len outputs are not written). + */ + if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) { + *stat = 0; + return 0; + } + /* + * Point to the record and extract its data. + */ + { + xfs_alloc_rec_t *rec; /* record data */ + + rec = XFS_ALLOC_REC_ADDR(block, ptr, cur); + *bno = INT_GET(rec->ar_startblock, ARCH_CONVERT); + *len = INT_GET(rec->ar_blockcount, ARCH_CONVERT); + } + *stat = 1; + return 0; +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched.
+ */ +int /* error */ +xfs_alloc_increment( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat) /* output: 1 if moved, 0 if off the right edge */ +{ + xfs_alloc_block_t *block; /* btree block */ + xfs_buf_t *bp; /* tree block buffer */ + int error; /* error return value */ + int lev; /* btree level */ + + ASSERT(level < cur->bc_nlevels); + /* + * Read-ahead to the right at this level. + */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + /* + * Get a pointer to the btree block. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + /* + * Increment the ptr at this level. If we're still in the block + * then we're done. + */ + if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + *stat = 1; + return 0; + } + /* + * If we just went off the right edge of the tree, return failure (the ptr at this level stays incremented). + */ + if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + bp = cur->bc_bufs[lev]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; +#endif + if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) + break; + /* + * Read-ahead the right block, we're going to read it + * in the next loop. + */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + /* + * If we went off the root then we are seriously confused. + */ + ASSERT(lev < cur->bc_nlevels); + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers.
+ */ + for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp); + lev > level; ) { + xfs_agblock_t agbno; /* block number of btree block */ + + agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, agbno, 0, &bp, + XFS_ALLOC_BTREE_REF))) + return error; + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; + cur->bc_ptrs[lev] = 1; /* first entry of the newly read block */ + } + *stat = 1; + return 0; +} + +/* + * Insert the current record (cur->bc_rec.a) at the point referenced by cur. + * The cursor may be inconsistent on return if splits have been done. + */ +int /* error */ +xfs_alloc_insert( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* output: result of the last xfs_alloc_insrec call */ +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + xfs_agblock_t nbno; /* new block number (split result) */ + xfs_btree_cur_t *ncur; /* new cursor (split result) */ + xfs_alloc_rec_t nrec; /* record being inserted this level */ + xfs_btree_cur_t *pcur; /* previous level's cursor */ + + level = 0; + nbno = NULLAGBLOCK; + INT_SET(nrec.ar_startblock, ARCH_CONVERT, cur->bc_rec.a.ar_startblock); + INT_SET(nrec.ar_blockcount, ARCH_CONVERT, cur->bc_rec.a.ar_blockcount); + ncur = (xfs_btree_cur_t *)0; + pcur = cur; + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nbno into this level of the tree. + * Note if we fail, nbno will be null. + */ + if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur, + &i))) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + return error; + } + /* + * See if the cursor we just used is trash.
+ * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) { + cur->bc_nlevels = pcur->bc_nlevels; /* copy the level count back to the caller's cursor */ + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* + * If we got a new cursor, switch to it. + */ + if (ncur) { + pcur = ncur; + ncur = (xfs_btree_cur_t *)0; + } + } while (nbno != NULLAGBLOCK); + *stat = i; + return 0; +} + +/* + * Lookup the record equal to [bno, len] in the btree given by cur; the search key is stored in cur->bc_rec.a. + */ +int /* error */ +xfs_alloc_lookup_eq( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur; the search key is stored in cur->bc_rec.a. + */ +int /* error */ +xfs_alloc_lookup_ge( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur; the search key is stored in cur->bc_rec.a. + */ +int /* error */ +xfs_alloc_lookup_le( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur, to the value given by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error.
+ */ +int /* error */ +xfs_alloc_update( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_alloc_block_t *block; /* btree block to update */ + int error; /* error return value */ + int ptr; /* current record number (updating) */ + + ASSERT(len > 0); + /* + * Pick up the current (leaf) block. + */ + block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) + return error; +#endif + /* + * Get the address of the rec to be updated. + */ + ptr = cur->bc_ptrs[0]; + { + xfs_alloc_rec_t *rp; /* pointer to updated record */ + + rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); + /* + * Fill in the new contents and log them. + */ + INT_SET(rp->ar_startblock, ARCH_CONVERT, bno); + INT_SET(rp->ar_blockcount, ARCH_CONVERT, len); + xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr); + } + /* + * If it's the by-size btree and it's the last leaf block and + * it's the last record... then update the size of the longest + * extent in the a.g., which we cache in the a.g. freespace header (agf) and in the in-core per-a.g. data. + */ + if (cur->bc_btnum == XFS_BTNUM_CNT && + INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK && + ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + xfs_agf_t *agf; /* a.g. freespace header */ + xfs_agnumber_t seqno; /* agf_seqno, used to index m_perag */ + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT); + cur->bc_mp->m_perag[seqno].pagf_longest = len; + INT_SET(agf->agf_longest, ARCH_CONVERT, len); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_LONGEST); + } + /* + * Updating first record in leaf. Pass new key value up to our parent.
+ */ + if (ptr == 1) { + xfs_alloc_key_t key; /* key containing [bno, len] */ + + INT_SET(key.ar_startblock, ARCH_CONVERT, bno); + INT_SET(key.ar_blockcount, ARCH_CONVERT, len); + if ((error = xfs_alloc_updkey(cur, &key, 1))) + return error; + } + return 0; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_alloc_btree.h linux-2.4-xfs/fs/xfs/xfs_alloc_btree.h --- linux-2.4.19/fs/xfs/xfs_alloc_btree.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_alloc_btree.h Wed Jul 10 23:13:50 2002 @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ALLOC_BTREE_H__ +#define __XFS_ALLOC_BTREE_H__ + +/* + * Freespace on-disk structures: record, key, pointer and block types for the free space btrees, plus the buffer-to-block accessor. + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_btree_sblock; +struct xfs_mount; + +/* + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. The magic numbers below are the ASCII codes of the four-letter tags. + */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ + +/* + * Data record/key structure: one struct serves as both the record type (xfs_alloc_rec_t) and the key type (xfs_alloc_key_t). + */ +typedef struct xfs_alloc_rec +{ + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef xfs_agblock_t xfs_alloc_ptr_t; /* btree pointer type */ + /* btree block header type */ +typedef struct xfs_btree_sblock xfs_alloc_block_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_ALLOC_BLOCK) +xfs_alloc_block_t *xfs_buf_to_alloc_block(struct xfs_buf *bp); +#define XFS_BUF_TO_ALLOC_BLOCK(bp) xfs_buf_to_alloc_block(bp) +#else +#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)(XFS_BUF_PTR(bp))) +#endif + +/* + * Real block structures have a size equal to the disk block size.
+ */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_SIZE) +int xfs_alloc_block_size(int lev, struct xfs_btree_cur *cur); +#define XFS_ALLOC_BLOCK_SIZE(lev,cur) xfs_alloc_block_size(lev,cur) +#else +#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MAXRECS) +int xfs_alloc_block_maxrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) xfs_alloc_block_maxrecs(lev,cur) +#else +#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) \ + ((cur)->bc_mp->m_alloc_mxr[lev != 0]) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MINRECS) +int xfs_alloc_block_minrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) xfs_alloc_block_minrecs(lev,cur) +#else +#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) \ + ((cur)->bc_mp->m_alloc_mnr[lev != 0]) +#endif /* in the macro forms of MAXRECS/MINRECS, [lev != 0] selects entry 0 for level 0 and entry 1 for any other level; NOTE(review): lev is not parenthesized there */ + +/* + * Minimum and maximum blocksize. + * The blocksize upper limit is pretty much arbitrary. The limits are kept as log2 values and the byte sizes are derived by shifting. + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) + +/* + * block numbers in the AG; SB is BB 0, AGF is BB 1, AGI is BB 2, AGFL is BB 3. In the macro forms below, XFS_BNO_BLOCK is the block after XFS_AGFL_BLOCK and XFS_CNT_BLOCK is the block after XFS_BNO_BLOCK. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BNO_BLOCK) +xfs_agblock_t xfs_bno_block(struct xfs_mount *mp); +#define XFS_BNO_BLOCK(mp) xfs_bno_block(mp) +#else +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CNT_BLOCK) +xfs_agblock_t xfs_cnt_block(struct xfs_mount *mp); +#define XFS_CNT_BLOCK(mp) xfs_cnt_block(mp) +#else +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) +#endif + +/* + * Record, key, and pointer address macros for btree blocks.
+ */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_REC_ADDR) +xfs_alloc_rec_t *xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i, + struct xfs_btree_cur *cur); +#define XFS_ALLOC_REC_ADDR(bb,i,cur) xfs_alloc_rec_addr(bb,i,cur) +#else +#define XFS_ALLOC_REC_ADDR(bb,i,cur) \ + XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, bb, i, \ + XFS_ALLOC_BLOCK_MAXRECS(0, cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_KEY_ADDR) +xfs_alloc_key_t *xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i, + struct xfs_btree_cur *cur); +#define XFS_ALLOC_KEY_ADDR(bb,i,cur) xfs_alloc_key_addr(bb,i,cur) +#else +#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \ + XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \ + XFS_ALLOC_BLOCK_MAXRECS(1, cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_PTR_ADDR) +xfs_alloc_ptr_t *xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i, + struct xfs_btree_cur *cur); +#define XFS_ALLOC_PTR_ADDR(bb,i,cur) xfs_alloc_ptr_addr(bb,i,cur) +#else +#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \ + XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \ + XFS_ALLOC_BLOCK_MAXRECS(1, cur)) +#endif /* in the macro forms, records are addressed with the level-0 block size and maxrecs, keys and pointers with the level-1 values */ + +/* + * Prototypes for externally visible routines. Each returns an error code; the routines that take a stat pointer report success/failure through it, separately from the error return. + */ + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_alloc_decrement( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat); /* success/failure */ + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_alloc_delete( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat); /* success/failure */ + +/* + * Get the data from the pointed-to record.
+ */ +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_alloc_increment( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat); /* success/failure */ + +/* + * Insert the current record at the point referenced by cur. + * The cursor may be inconsistent on return if splits have been done. + */ +int /* error */ +xfs_alloc_insert( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat); /* success/failure */ + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. The outcome of the search is reported through *stat. + */ +int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +/* + * Update the record referred to by cur, to the value given by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error. In the implementation the errors come from the block check (DEBUG builds only) and from the parent key update; len is asserted to be greater than zero.
+ */ +int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len); /* length of extent, must be > 0 */ + +#endif /* __XFS_ALLOC_BTREE_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_arch.h linux-2.4-xfs/fs/xfs/xfs_arch.h --- linux-2.4.19/fs/xfs/xfs_arch.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_arch.h Fri Aug 23 14:44:16 2002 @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ARCH_H__ +#define __XFS_ARCH_H__ + +#ifndef XFS_BIG_FILESYSTEMS +#error XFS_BIG_FILESYSTEMS must be defined true or false +#endif + +#ifdef __KERNEL__ + +#include + +#ifdef __LITTLE_ENDIAN +# define __BYTE_ORDER __LITTLE_ENDIAN +#endif +#ifdef __BIG_ENDIAN +# define __BYTE_ORDER __BIG_ENDIAN +#endif + +#endif /* __KERNEL__ */ + +/* do we need conversion? */ + +#define ARCH_NOCONVERT 1 +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define ARCH_CONVERT 0 +#else +#define ARCH_CONVERT ARCH_NOCONVERT +#endif + +/* generic swapping macros */ + +#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var)))) +#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var)))) +#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var)))) + +#define INT_SWAP(type, var) \ + ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \ + ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \ + ((sizeof(type) == 2) ? 
INT_SWAP16(type,var) : \ + (var)))) + +#define INT_SWAP_UNALIGNED_32(from,to) \ + { \ + ((__u8*)(to))[0] = ((__u8*)(from))[3]; \ + ((__u8*)(to))[1] = ((__u8*)(from))[2]; \ + ((__u8*)(to))[2] = ((__u8*)(from))[1]; \ + ((__u8*)(to))[3] = ((__u8*)(from))[0]; \ + } + +#define INT_SWAP_UNALIGNED_64(from,to) \ + { \ + INT_SWAP_UNALIGNED_32( ((__u8*)(from)) + 4, ((__u8*)(to))); \ + INT_SWAP_UNALIGNED_32( ((__u8*)(from)), ((__u8*)(to)) + 4); \ + } + +/* + * get and set integers from potentially unaligned locations: values are assembled from, or split into, single bytes, and the _LE/_BE suffix fixes the byte order of the stored value. NOTE(review): the SET macros here, like the INT_SWAP_UNALIGNED ones above, expand to a bare { } block rather than do { } while (0). + */ + +#define INT_GET_UNALIGNED_16_LE(pointer) \ + ((__u16)((((__u8*)(pointer))[0] ) | (((__u8*)(pointer))[1] << 8 ))) +#define INT_GET_UNALIGNED_16_BE(pointer) \ + ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1]))) +#define INT_SET_UNALIGNED_16_LE(pointer,value) \ + { \ + ((__u8*)(pointer))[0] = (((value) ) & 0xff); \ + ((__u8*)(pointer))[1] = (((value) >> 8) & 0xff); \ + } +#define INT_SET_UNALIGNED_16_BE(pointer,value) \ + { \ + ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \ + ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ + } + +#define INT_GET_UNALIGNED_32_LE(pointer) \ + ((__u32)((((__u8*)(pointer))[0] ) | (((__u8*)(pointer))[1] << 8 ) \ + |(((__u8*)(pointer))[2] << 16) | (((__u8*)(pointer))[3] << 24))) +#define INT_GET_UNALIGNED_32_BE(pointer) \ + ((__u32)((((__u8*)(pointer))[0] << 24) | (((__u8*)(pointer))[1] << 16) \ + |(((__u8*)(pointer))[2] << 8) | (((__u8*)(pointer))[3] ))) + +#define INT_GET_UNALIGNED_64_LE(pointer) \ + (((__u64)(INT_GET_UNALIGNED_32_LE(((__u8*)(pointer))+4)) << 32 ) \ + |((__u64)(INT_GET_UNALIGNED_32_LE(((__u8*)(pointer)) )) )) +#define INT_GET_UNALIGNED_64_BE(pointer) \ + (((__u64)(INT_GET_UNALIGNED_32_BE(((__u8*)(pointer)) )) << 32 ) \ + |((__u64)(INT_GET_UNALIGNED_32_BE(((__u8*)(pointer))+4)) )) + +/* + * now pick the right ones for our MACHINE ARCHITECTURE (the unsuffixed names map to the host byte order) + */ + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define INT_GET_UNALIGNED_16(pointer) INT_GET_UNALIGNED_16_LE(pointer) +#define 
INT_SET_UNALIGNED_16(pointer,value) INT_SET_UNALIGNED_16_LE(pointer,value) +#define INT_GET_UNALIGNED_32(pointer) INT_GET_UNALIGNED_32_LE(pointer) +#define INT_GET_UNALIGNED_64(pointer) INT_GET_UNALIGNED_64_LE(pointer) +#else +#define INT_GET_UNALIGNED_16(pointer) INT_GET_UNALIGNED_16_BE(pointer) +#define INT_SET_UNALIGNED_16(pointer,value) INT_SET_UNALIGNED_16_BE(pointer,value) +#define INT_GET_UNALIGNED_32(pointer) INT_GET_UNALIGNED_32_BE(pointer) +#define INT_GET_UNALIGNED_64(pointer) INT_GET_UNALIGNED_64_BE(pointer) +#endif + +/* define generic INT_ macros: arch == ARCH_NOCONVERT uses the value as is, any other arch byte-swaps it */ + +#define INT_GET(reference,arch) \ + (((arch) == ARCH_NOCONVERT) \ + ? \ + (reference) \ + : \ + INT_SWAP((reference),(reference)) \ + ) + +/* does not return a value; a compile-time constant valueref is swapped before the single store, anything else is stored first and then swapped in place */ +#define INT_SET(reference,arch,valueref) \ + (__builtin_constant_p(valueref) ? \ + (void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \ + (void)( \ + ((reference) = (valueref)), \ + ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \ + ) \ + ) + +/* does not return a value; code is an operator plus operand (INT_MOD passes "+=(delta)") applied to the reference after it has been converted in place, then the reference is converted back */ +#define INT_MOD_EXPR(reference,arch,code) \ + (void)(((arch) == ARCH_NOCONVERT) \ + ? \ + ((reference) code) \ + : \ + ( \ + (reference) = INT_GET((reference),arch) , \ + ((reference) code), \ + INT_SET(reference, arch, reference) \ + ) \ + ) + +/* does not return a value; adds delta to reference */ +#define INT_MOD(reference,arch,delta) \ + (void)( \ + INT_MOD_EXPR(reference,arch,+=(delta)) \ + ) + +/* + * INT_COPY - copy a value between two locations with the + * _same architecture_ but _potentially different sizes_ + * + * if the sizes (sizeof) of the two parameters are equal or they are + * in native architecture, a simple copy is done + * + * otherwise, architecture conversions are done + * + */ + +/* does not return a value */ +#define INT_COPY(dst,src,arch) \ + (void)( \ + ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \ + ? 
\ + ((dst) = (src)) \ + : \ + INT_SET(dst, arch, INT_GET(src, arch)) \ + ) + +/* + * INT_XLATE - copy a value in either direction between two locations + * with different architectures + * + * dir < 0 - copy from memory to buffer (native to arch) + * dir > 0 - copy from buffer to memory (arch to native); dir == 0 is a caller error (asserted) + */ + +/* does not return a value; expands to a bare { } block */ +#define INT_XLATE(buf,mem,dir,arch) {\ + ASSERT(dir); \ + if (dir>0) { \ + (mem)=INT_GET(buf, arch); \ + } else { \ + INT_SET(buf, arch, mem); \ + } \ +} + +#define INT_ISZERO(reference,arch) \ + ((reference) == 0) + +#define INT_ZERO(reference,arch) \ + ((reference) = 0) + +#define INT_GET_UNALIGNED_16_ARCH(pointer,arch) \ + ( ((arch) == ARCH_NOCONVERT) \ + ? \ + (INT_GET_UNALIGNED_16(pointer)) \ + : \ + (INT_GET_UNALIGNED_16_BE(pointer)) \ + ) +#define INT_SET_UNALIGNED_16_ARCH(pointer,value,arch) \ + if ((arch) == ARCH_NOCONVERT) { \ + INT_SET_UNALIGNED_16(pointer,value); \ + } else { \ + INT_SET_UNALIGNED_16_BE(pointer,value); \ + } + +#define DIRINO4_GET_ARCH(pointer,arch) \ + ( ((arch) == ARCH_NOCONVERT) \ + ? \ + (INT_GET_UNALIGNED_32(pointer)) \ + : \ + (INT_GET_UNALIGNED_32_BE(pointer)) \ + ) + +#if XFS_BIG_FILESYSTEMS +#define DIRINO_GET_ARCH(pointer,arch) \ + ( ((arch) == ARCH_NOCONVERT) \ + ? 
\ + (INT_GET_UNALIGNED_64(pointer)) \ + : \ + (INT_GET_UNALIGNED_64_BE(pointer)) \ + ) +#else +/* MACHINE ARCHITECTURE dependent: without XFS_BIG_FILESYSTEMS only four bytes are read, through DIRINO4_GET_ARCH. NOTE(review): "pointer" is not parenthesized inside the cast of the little-endian form, so pass a simple expression. */ +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define DIRINO_GET_ARCH(pointer,arch) \ + DIRINO4_GET_ARCH((((__u8*)pointer)+4),arch) +#else +#define DIRINO_GET_ARCH(pointer,arch) \ + DIRINO4_GET_ARCH(pointer,arch) +#endif +#endif + +#define DIRINO_COPY_ARCH(from,to,arch) \ + if ((arch) == ARCH_NOCONVERT) { \ + bcopy(from,to,sizeof(xfs_ino_t)); \ + } else { \ + INT_SWAP_UNALIGNED_64(from,to); \ + } +#define DIRINO4_COPY_ARCH(from,to,arch) \ + if ((arch) == ARCH_NOCONVERT) { \ + bcopy((((__u8*)from+4)),to,sizeof(xfs_dir2_ino4_t)); \ + } else { \ + INT_SWAP_UNALIGNED_32(from,to); \ + } + +#endif /* __XFS_ARCH_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_attr.c linux-2.4-xfs/fs/xfs/xfs_attr.c --- linux-2.4.19/fs/xfs/xfs_attr.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_attr.c Tue Aug 13 20:51:16 2002 @@ -0,0 +1,2294 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever.
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include <xfs.h> + +/* + * xfs_attr.c + * + * Provide the external interfaces to manage attribute lists. The entry points choose the shortform, single-leaf-block or multi-block (node) routines according to the current state of the inode's attribute fork. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute list fits inside the inode. + */ +STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is one block. + */ +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); + +/* + * Internal routines when attribute list is more than one block. + */ +STATIC int xfs_attr_node_addname(xfs_da_args_t *args); +STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); +STATIC int xfs_attr_fillstate(xfs_da_state_t *state); +STATIC int xfs_attr_refillstate(xfs_da_state_t *state); + +/* + * Routines to manipulate out-of-line attribute values.
+ */ +STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args); +STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); +STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ +#define ATTR_RMTVALUE_TRANSBLKS 8 /* max # of blks in a transaction */ + +#if defined(DEBUG) +ktrace_t *xfs_attr_trace_buf; +#endif + + + +/*======================================================================== + * Overall external interface routines. + *========================================================================*/ + +/*ARGSUSED*/ +int /* error: 0 on success (EEXIST from the worker routine is mapped to 0), else an errno value such as EINVAL, EFAULT, ENOATTR or EIO */ +xfs_attr_get(bhv_desc_t *bdp, char *name, char *value, int *valuelenp, + int flags, struct cred *cred) +{ + xfs_da_args_t args; + int error; + int namelen; + xfs_inode_t *ip = XFS_BHVTOI(bdp); + + if (!name) + return EINVAL; + ASSERT(MAXNAMELEN-1 <= 0xff); /* length is stored in uint8 */ + namelen = strlen(name); + if (namelen >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + XFS_STATS_INC(xfsstats.xs_attr_get); + + if (XFS_IFORK_Q(ip) == 0) + return ENOATTR; + + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return (EIO); + + /* + * Do we answer them, or ignore them? The caller needs read access; the check is made under the shared inode lock, which stays held across the lookup below. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + if ((error = xfs_iaccess(XFS_BHVTOI(bdp), IREAD, cred))) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return(XFS_ERROR(error)); + } + + /* + * Fill in the arg structure for this request. *valuelenp is copied in here and overwritten from args.valuelen on the way out. + */ + bzero((char *)&args, sizeof(args)); + args.name = name; + args.namelen = namelen; + args.value = value; + args.valuelen = *valuelenp; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = ip; + args.whichfork = XFS_ATTR_FORK; + args.trans = NULL; + + /* + * Decide on what work routines to call based on the inode size. No attribute fork or an empty extents-format fork gives ENOATTR; local format uses the shortform routine, a single block the leaf routine, anything else the node routine.
+ */ + if (XFS_IFORK_Q(ip) == 0 || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) { + error = XFS_ERROR(ENOATTR); + } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_getvalue(&args); + } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_get(&args); + } else { + error = xfs_attr_node_get(&args); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + /* + * Return the number of bytes in the value to the caller. This store happens whether or not the lookup failed. + */ + *valuelenp = args.valuelen; + + if (error == EEXIST) + error = 0; + return(error); +} + +/*ARGSUSED*/ +int /* error: 0 on success, else an errno value */ +xfs_attr_set(bhv_desc_t *bdp, char *name, char *value, int valuelen, int flags, + struct cred *cred) +{ + xfs_da_args_t args; + xfs_inode_t *dp; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error, err2, committed; + int local, size; + uint nblks; + xfs_mount_t *mp; + int rsvd = (flags & ATTR_ROOT) != 0; + int namelen; + + ASSERT(MAXNAMELEN-1 <= 0xff); /* length is stored in uint8; NOTE(review): unlike xfs_attr_get, name is not checked for NULL before strlen() */ + namelen = strlen(name); + if (namelen >= MAXNAMELEN) + return EFAULT; /* match irix behaviour */ + + XFS_STATS_INC(xfsstats.xs_attr_set); + /* + * Do we answer them, or ignore them? Write access is checked under the shared inode lock, which is dropped again before the transaction is set up. + */ + dp = XFS_BHVTOI(bdp); + mp = dp->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return (EIO); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if ((error = xfs_iaccess(dp, IWRITE, cred))) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(error)); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Attach the dquots to the inode. + */ + if (XFS_IS_QUOTA_ON(mp)) { + if ((error = xfs_qm_dqattach(dp, 0))) + return (error); + } + + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + error = xfs_bmap_add_attrfork(dp, rsvd); + if (error) + return(error); + } + + /* + * Fill in the arg structure for this request.
+ */ + bzero((char *)&args, sizeof(args)); + args.name = name; + args.namelen = namelen; + args.value = value; + args.valuelen = valuelen; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.whichfork = XFS_ATTR_FORK; + args.oknoent = 1; + + /* Determine space new attribute will use, and if it will be inline + * or out of line. + */ + size = xfs_attr_leaf_newentsize(&args, mp->m_sb.sb_blocksize, &local); + + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (local) { + if (size > (mp->m_sb.sb_blocksize >> 1)) { + /* Double split possible: the entry is larger than half a block, so the reservation is doubled */ + nblks <<= 1; + } + } else { + uint dblocks = XFS_B_TO_FSB(mp, valuelen); + /* Out of line attribute, cannot double split, but make + * room for the attribute value itself. + */ + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + /* Size is now blocks for attribute data; nblks feeds args.total, the transaction reservation and the quota reservation below */ + args.total = nblks; + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log.
+ */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary (rsvd is set when ATTR_ROOT is present in flags) + */ + + if (rsvd) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, (uint) nblks, + XFS_ATTRSET_LOG_RES(mp, nblks), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRSET_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + if (XFS_IS_QUOTA_ON(mp)) { + if (rsvd) { + error = xfs_trans_reserve_blkquota_force(args.trans, + dp, nblks); + } else { + error = xfs_trans_reserve_blkquota(args.trans, + dp, nblks); + } + if (error) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + return (error); + } + } + + xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args.trans, dp); + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list, but only after an attempt to add the attribute in shortform has failed with ENOSPC. + */ + if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && + (dp->i_d.di_anextents == 0))) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + (void)xfs_attr_shortform_create(&args); + + /* + * Try to add the attr to the attribute list in + * the inode. + */ + error = xfs_attr_shortform_addname(&args); + if (error != ENOSPC) { + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). The value returned is the shortform error if there was one, otherwise the commit status (err2). + */ + ASSERT(args.trans != NULL); + + /* + * If this is a synchronous mount, make sure that + * the transaction goes to disk before returning + * to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + err2 = xfs_trans_commit(args.trans, + XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + /* + * Hit the inode change time (only when the add succeeded and ATTR_KERNOTIME was not passed).
+ */ + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_ichgtime(dp, XFS_ICHGTIME_CHG); + } + return(error == 0 ? err2 : error); + } + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. NOTE(review): committed is only written by xfs_bmap_finish(); when xfs_attr_shortform_to_leaf() itself fails, the ASSERT below reads it uninitialized and args.trans is cleared without being cancelled here -- confirm this is intended. + */ + XFS_BMAP_INIT(args.flist, args.firstblock); + error = xfs_attr_shortform_to_leaf(&args); + if (!error) { + error = xfs_bmap_finish(&args.trans, args.flist, + *args.firstblock, &committed); + } + if (error) { + ASSERT(committed); + args.trans = NULL; + xfs_bmap_cancel(&flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args.trans, dp); + } + + /* + * Commit the leaf transformation. We'll need another (linked) + * transaction to add the new attribute to the leaf. + */ + if ((error = xfs_attr_rolltrans(&args.trans, dp))) + goto out; + + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_addname(&args); + } else { + error = xfs_attr_node_addname(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + /* + * Commit the last in the sequence of transactions. The inode core is logged into it first. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + /* + * Hit the inode change time (only when the commit succeeded and ATTR_KERNOTIME was not passed).
+ */ + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_ichgtime(dp, XFS_ICHGTIME_CHG); + } + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. Returns 0 on success or an errno value; ENOATTR when the inode has no attribute fork or an empty extents-format one. + */ +/*ARGSUSED*/ +int /* error */ +xfs_attr_remove(bhv_desc_t *bdp, char *name, int flags, struct cred *cred) +{ + xfs_da_args_t args; + xfs_inode_t *dp; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error; + xfs_mount_t *mp; + int namelen; + + ASSERT(MAXNAMELEN-1<=0xff); /* length is stored in uint8; NOTE(review): unlike xfs_attr_get, name is not checked for NULL before strlen() */ + namelen = strlen(name); + if (namelen>=MAXNAMELEN) + return EFAULT; /* match irix behaviour */ + + XFS_STATS_INC(xfsstats.xs_attr_remove); + + /* + * Do we answer them, or ignore them? + */ + dp = XFS_BHVTOI(bdp); + mp = dp->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return (EIO); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if ((error = xfs_iaccess(dp, IWRITE, cred))) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(error)); + } else if (XFS_IFORK_Q(dp) == 0 || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(ENOATTR)); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Fill in the arg structure for this request. + */ + bzero((char *)&args, sizeof(args)); + args.name = name; + args.namelen = namelen; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.total = 0; + args.whichfork = XFS_ATTR_FORK; + + /* + * Attach the dquots to the inode. + */ + if (XFS_IS_QUOTA_ON(mp)) { + if (XFS_NOT_DQATTACHED(mp, dp)) { + if ((error = xfs_qm_dqattach(dp, 0))) + return (error); + } + } + /* + * Start our first transaction of the day.
+ * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (flags & ATTR_ROOT) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, + XFS_ATTRRM_SPACE_RES(mp), + XFS_ATTRRM_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRRM_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args.trans, dp); + + /* + * Decide on what work routines to call based on the inode size. The fork state is tested again here, now under the exclusive lock; the earlier test was made under a shared lock that has since been dropped. + */ + if (XFS_IFORK_Q(dp) == 0 || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + error = XFS_ERROR(ENOATTR); + goto out; + } + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(&args); + if (error) { + goto out; + } + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(&args); + } else { + error = xfs_attr_node_removename(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + /* + * Commit the last in the sequence of transactions.
+ */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + /* + * Hit the inode change time (only when the commit succeeded and ATTR_KERNOTIME was not passed). + */ + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_ichgtime(dp, XFS_ICHGTIME_CHG); + } + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, + attrlist_cursor_kern_t *cursor, struct cred *cred) +{ + xfs_attr_list_context_t context; + xfs_inode_t *dp; + int error; + + XFS_STATS_INC(xfsstats.xs_attr_list); + + /* + * Validate the cursor. The pad fields must be zero, and a cursor not yet marked initted must not carry a position (hashval, blkno, offset). + */ + if (cursor->pad1 || cursor->pad2) + return(XFS_ERROR(EINVAL)); + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return(XFS_ERROR(EINVAL)); + + /* + * Check for a properly aligned buffer. The address must be a multiple of sizeof(int); with ATTR_KERNOVAL the buffer size is treated as zero. + */ + if (((long)buffer) & (sizeof(int)-1)) + return(XFS_ERROR(EFAULT)); + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer.
+ */ + context.dp = dp = XFS_BHVTOI(bdp); + context.cursor = cursor; + context.count = 0; + context.dupcnt = 0; + context.resynch = 1; + context.flags = flags; + if (!(flags & ATTR_KERNAMELS)) { + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.alist = (attrlist_t *)buffer; + context.alist->al_count = 0; + context.alist->al_more = 0; + context.alist->al_offset[0] = context.bufsize; + } + else { + context.bufsize = bufsize; + context.firstu = context.bufsize; + context.alist = (attrlist_t *)buffer; + } + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return (EIO); + /* + * Do they have permission? + */ + xfs_ilock(dp, XFS_ILOCK_SHARED); + if ((error = xfs_iaccess(dp, IREAD, cred))) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(error)); + } + + /* + * Decide on what work routines to call based on the inode size. + */ + xfs_attr_trace_l_c("syscall start", &context); + if (XFS_IFORK_Q(dp) == 0 || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(&context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(&context); + } else { + error = xfs_attr_node_list(&context); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_attr_trace_l_c("syscall end", &context); + + if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) { + ASSERT(error >= 0); + } + else { /* must return negated buffer size or the error */ + if (context.count < 0) + error = XFS_ERROR(ERANGE); + else + error = -context.count; + } + + return(error); +} + +int /* error */ +xfs_attr_inactive(xfs_inode_t *dp) +{ + xfs_trans_t *trans; + xfs_mount_t *mp; + int error; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + /* XXXsup - why on earth are we taking ILOCK_EXCL here??? 
*/ + xfs_ilock(dp, XFS_ILOCK_EXCL); + if ((XFS_IFORK_Q(dp) == 0) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(0); + } + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ATTRINVAL_LOG_COUNT))) { + xfs_trans_cancel(trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(trans, dp); + + /* + * Decide on what work routines to call based on the inode size. + */ + if ((XFS_IFORK_Q(dp) == 0) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + error = 0; + goto out; + } + error = xfs_attr_root_inactive(&trans, dp); + if (error) + goto out; + /* + * signal synchronous inactive transactions unless this + * is a synchronous mount filesystem in which case we + * know that we're here because we've been called out of + * xfs_inactive which means that the last reference is gone + * and the unlink transaction has already hit the disk so + * async inactive transactions are safe. + */ + if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK, + (!(mp->m_flags & XFS_MOUNT_WSYNC) + ? 
1 : 0)))) + goto out; + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + + + +/*======================================================================== + * External routines when attribute list is inside the inode + *========================================================================*/ + +/* + * Add a name to the shortform attribute list structure + * This is the external routine. + */ +STATIC int +xfs_attr_shortform_addname(xfs_da_args_t *args) +{ + int newsize, retval; + + retval = xfs_attr_shortform_lookup(args); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + return(retval); + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) + return(retval); + retval = xfs_attr_shortform_remove(args); + ASSERT(retval == 0); + } + + newsize = XFS_ATTR_SF_TOTSIZE(args->dp); + newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + if ((newsize <= XFS_IFORK_ASIZE(args->dp)) && + (args->namelen < XFS_ATTR_SF_ENTSIZE_MAX) && + (args->valuelen < XFS_ATTR_SF_ENTSIZE_MAX)) { + retval = xfs_attr_shortform_add(args); + ASSERT(retval == 0); + } else { + return(XFS_ERROR(ENOSPC)); + } + return(0); +} + + +/*======================================================================== + * External routines when attribute list is one block + *========================================================================*/ + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). 
+ */
+int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval, error, committed;
+
+	/*
+	 * Overview: look the name up in block 0 of the attribute fork and
+	 * add it there.  If the leaf reports ENOSPC the list is converted
+	 * to node format and the add is handed to xfs_attr_node_addname().
+	 * Otherwise any out-of-line value is written and, for a replace of
+	 * an existing name, the old entry is removed, each step in its own
+	 * transaction rolled via xfs_attr_rolltrans().
+	 *
+	 * Read the (only) block in the attribute list in.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+
+	/*
+	 * Look up the given attribute in the leaf block.  Figure out if
+	 * the given flags produce an error or call for an atomic rename.
+	 */
+	retval = xfs_attr_leaf_lookup_int(bp, args);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		xfs_da_brelse(args->trans, bp);
+		return(retval);
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE) {	/* pure create op */
+			xfs_da_brelse(args->trans, bp);
+			return(retval);
+		}
+		args->rename = 1;			/* an atomic rename */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+	}
+
+	/*
+	 * Add the attribute to the leaf block, transitioning to a Btree
+	 * if required.
+	 */
+	retval = xfs_attr_leaf_add(bp, args);
+	xfs_da_buf_done(bp);
+	if (retval == ENOSPC) {
+		/*
+		 * Promote the attribute list to the Btree format, then
+		 * Commit that transaction so that the node_addname() call
+		 * can manage its own transactions.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_attr_leaf_to_node(args);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);	/* NOTE(review): "committed" is
+						 * only written by
+						 * xfs_bmap_finish(); it is read
+						 * here even when that call was
+						 * skipped. */
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		/*
+		 * Commit the current trans (including the inode) and start
+		 * a new one.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			return (error);
+
+		/*
+		 * Fob the whole rest of the problem off on the Btree code.
+		 */
+		error = xfs_attr_node_addname(args);
+		return(error);
+	}
+
+	/*
+	 * Commit the transaction that added the attr name so that
+	 * later routines can manage their own transactions.
+	 */
+	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		return (error);
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->rename) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr_leaf_flipflags(args);
+		if (error)
+			return(error);
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return(error);
+		}
+
+		/*
+		 * Read in the block containing the "old" attr, then
+		 * remove the "old" attr from that block (neat, huh!)
+		 * The return value of the remove is deliberately ignored.
+		 */
+		error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
+						     &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+		(void)xfs_attr_leaf_remove(bp, args);
+
+		/*
+		 * If the result is small enough, shrink it all into the inode.
+		 */
+		if (xfs_attr_shortform_allfit(bp, dp)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_shortform(bp, args);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				return(error);
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		} else
+			xfs_da_buf_done(bp);
+
+		/*
+		 * Commit the remove and start the next trans in series.
+		 */
+		error = xfs_attr_rolltrans(&args->trans, dp);
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr_leaf_clearflag(args);
+	}
+	return(error);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns the error from reading block 0, or ENOATTR if the lookup
+ * reports that the name is absent.  Otherwise the entry is removed
+ * (the remove's own return value is ignored) and, if
+ * xfs_attr_shortform_allfit() says so, the leaf is converted back to
+ * shortform.  Returns 0 unless that conversion fails.
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int committed;
+	int error;
+
+	/*
+	 * Remove the attribute.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+
+	ASSERT(bp != NULL);
+	error = xfs_attr_leaf_lookup_int(bp, args);
+	if (error == ENOATTR) {
+		xfs_da_brelse(args->trans, bp);
+		return(error);
+	}
+
+	(void)xfs_attr_leaf_remove(bp, args);
+
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	if (xfs_attr_shortform_allfit(bp, dp)) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_attr_leaf_to_shortform(bp, args);
+		/* bp is gone due to xfs_da_shrink_inode */
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+	} else
+		xfs_da_buf_done(bp);
+	return(0);
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+	xfs_dabuf_t *bp;
+	int error;
+
+	args->blkno = 0;	/* the single leaf is block 0 of the fork */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+
+	error = xfs_attr_leaf_lookup_int(bp, args);
+	if (error != EEXIST) {	/* anything but "found" goes back as-is */
+		xfs_da_brelse(args->trans, bp);
+		return(error);
+	}
+	error = xfs_attr_leaf_getvalue(bp, args);
+	xfs_da_brelse(args->trans, bp);	/* released before any remote read */
+	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
+		error = xfs_attr_rmtval_get(args);
+	}
+	return(error);
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ *
+ * Resets the cursor's blkno to 0 and reads block 0 of the attribute
+ * fork.  Returns the read error, or EFSCORRUPTED if the block does not
+ * carry XFS_ATTR_LEAF_MAGIC.  Otherwise returns 0; the value returned
+ * by xfs_attr_leaf_list_int() is deliberately discarded.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+	xfs_attr_leafblock_t *leaf;
+	int error;
+	xfs_dabuf_t *bp;
+
+	context->cursor->blkno = 0;
+	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_ATTR_LEAF_MAGIC) {
+		xfs_da_brelse(NULL, bp);
+		return(XFS_ERROR(EFSCORRUPTED));
+	}
+
+	(void)xfs_attr_leaf_list_int(bp, context);
+	xfs_da_brelse(NULL, bp);
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines when attribute list size > XFS_LBSIZE(mp).
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_mount_t *mp;
+	int committed, retval, error;
+
+	/*
+	 * Return value: a non-zero "error" from any step is returned
+	 * as-is.  Otherwise "retval" is returned, which is ENOATTR for an
+	 * ATTR_REPLACE of a missing name, EEXIST for an ATTR_CREATE of an
+	 * existing name, and 0 once the add (and any rename cleanup) is done.
+	 *
+	 * Ownership rule for "state": whenever it has been freed it must
+	 * also be set to NULL, because the "out" label frees any non-NULL
+	 * state.
+	 *
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	dp = args->dp;
+	mp = dp->i_mount;
+restart:
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = mp;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		goto out;
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		goto out;
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			goto out;
+		args->rename = 1;			/* atomic rename op */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+	}
+
+	retval = xfs_attr_leaf_add(blk->bp, state->args);
+	if (retval == ENOSPC) {
+		if (state->path.active == 1) {
+			/*
+			 * It's really a single leaf node, but it had
+			 * out-of-line values so it looked like it *might*
+			 * have been a b-tree.
+			 */
+			xfs_da_state_free(state);
+			/*
+			 * Both error exits below jump to "out", which frees
+			 * any non-NULL state.  Clear the pointer so the
+			 * state just released is not freed a second time.
+			 */
+			state = NULL;
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_node(args);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				/*
+				 * NOTE(review): "committed" is only written
+				 * by xfs_bmap_finish(); it is read here even
+				 * when that call was skipped.
+				 */
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+
+			/*
+			 * Commit the node conversion and start the next
+			 * trans in the chain.
+			 */
+			if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+				goto out;
+
+			/* "restart" allocates a fresh state structure. */
+			goto restart;
+		}
+
+		/*
+		 * Split as many Btree elements as required.
+		 * This code tracks the new and old attr's location
+		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
+		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_da_split(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+	} else {
+		/*
+		 * Addition succeeded, update Btree hashvals.
+		 */
+		xfs_da_fixhashpath(state, &state->path);
+	}
+
+	/*
+	 * Kill the state structure, we're done with it and need to
+	 * allow the buffers to come back later.
+	 */
+	xfs_da_state_free(state);
+	state = NULL;
+
+	/*
+	 * Commit the leaf addition or btree split and start the next
+	 * trans in the chain.
+	 */
+	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		goto out;
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return(error);	/* state is NULL here, nothing to free */
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->rename) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr_leaf_flipflags(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return(error);	/* state is still NULL here */
+		}
+
+		/*
+		 * Re-find the "old" attribute entry after any split ops.
+		 * The INCOMPLETE flag means that we will find the "old"
+		 * attr, not the "new" one.
+		 */
+		args->flags |= XFS_ATTR_INCOMPLETE;
+		state = xfs_da_state_alloc();
+		state->args = args;
+		state->mp = mp;
+		state->blocksize = state->mp->m_sb.sb_blocksize;
+		state->inleaf = 0;
+		error = xfs_da_node_lookup_int(state, &retval);
+		if (error)
+			goto out;
+
+		/*
+		 * Remove the name and update the hashvals in the tree.
+		 */
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+		error = xfs_attr_leaf_remove(blk->bp, args);
+		xfs_da_fixhashpath(state, &state->path);
+
+		/*
+		 * Check to see if the tree needs to be collapsed.
+		 * NOTE(review): "retval" here still holds the result of the
+		 * lookup above, while the value returned by
+		 * xfs_attr_leaf_remove() went into "error" and is not
+		 * examined before being overwritten.  Confirm this is the
+		 * intended test.
+		 */
+		if (retval && (state->path.active > 1)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_da_join(state);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		}
+
+		/*
+		 * Commit and start the next trans in the chain.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			goto out;
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr_leaf_clearflag(args);
+		if (error)
+			goto out;
+	}
+	retval = error = 0;
+
+out:
+	if (state)
+		xfs_da_state_free(state);
+	if (error)
+		return(error);
+	return(retval);
+}
+
+/*
+ * Remove a name from a B-tree attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval, error, committed;
+
+	/*
+	 * Tie a string around our finger to remind us where we are.
+	 */
+	dp = args->dp;
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error || (retval != EEXIST)) {
+		if (error == 0)
+			error = retval;
+		goto out;
+	}
+
+	/*
+	 * If there is an out-of-line value, de-allocate the blocks.
+	 * This is done before we remove the attribute so that we don't
+	 * overflow the maximum size of a transaction and/or hit a deadlock.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->bp != NULL);
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if (args->rmtblkno > 0) {
+		/*
+		 * Fill in disk block numbers in the state structure
+		 * so that we can get the buffers back after we commit
+		 * several transactions in the following calls.
+		 */
+		error = xfs_attr_fillstate(state);
+		if (error)
+			goto out;
+
+		/*
+		 * Mark the attribute as INCOMPLETE, then bunmapi() the
+		 * remote value.
+		 */
+		error = xfs_attr_leaf_setflag(args);
+		if (error)
+			goto out;
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Refill the state structure with buffers, the prior calls
+		 * released our buffers.
+		 */
+		error = xfs_attr_refillstate(state);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 * The remove's return value is kept in "retval" and drives the
+	 * collapse test below.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	retval = xfs_attr_leaf_remove(blk->bp, args);
+	xfs_da_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	if (retval && (state->path.active > 1)) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_da_join(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);	/* NOTE(review): "committed" is
+						 * only written by
+						 * xfs_bmap_finish(); it is read
+						 * here even when that call was
+						 * skipped. */
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		/*
+		 * Commit the Btree join operation and start a new trans.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			goto out;
+	}
+
+	/*
+	 * If the result is small enough, push it all into the inode.
+	 */
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		/*
+		 * Have to get rid of the copy of this dabuf in the state.
+		 */
+		ASSERT(state->path.active == 1);
+		ASSERT(state->path.blk[0].bp);
+		xfs_da_buf_done(state->path.blk[0].bp);
+		state->path.blk[0].bp = NULL;
+
+		error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+						     XFS_ATTR_FORK);
+		if (error)
+			goto out;
+		ASSERT(INT_GET(((xfs_attr_leafblock_t *)
+				      bp->data)->hdr.info.magic, ARCH_CONVERT)
+						       == XFS_ATTR_LEAF_MAGIC);
+
+		if (xfs_attr_shortform_allfit(bp, dp)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_shortform(bp, args);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		} else
+			xfs_da_brelse(args->trans, bp);
+	}
+	error = 0;
+
+out:
+	xfs_da_state_free(state);
+	return(error);
+}
+
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits has released these buffers.
+ *
+ * For every active level of both "path" and "altpath": if a buffer is
+ * attached, its disk block number is recorded, xfs_da_buf_done() is
+ * called on it and the pointer is cleared; otherwise the recorded disk
+ * block number is set to 0.  Always returns 0.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level;
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = xfs_da_blkno(blk->bp);
+			xfs_da_buf_done(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = xfs_da_blkno(blk->bp);
+			xfs_da_buf_done(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits has released those
+ * buffers from our grip.
+ *
+ * For every active level of both "path" and "altpath": a non-zero
+ * recorded disk block number is read back with xfs_da_read_buf() into
+ * the level's buffer pointer; a zero one leaves the pointer NULL.
+ * Returns the first read error, else 0.  Buffers already reattached
+ * are left attached to the state when a later read fails.
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level, error;
+
+	/*
+	 * Roll down the "path" in the state structure, re-reading the
+	 * buffer for each level that recorded an on-disk block number.
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da_read_buf(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, re-reading the
+	 * buffer for each level that recorded an on-disk block number.
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da_read_buf(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and for single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ *
+ * Returns the lookup error if the tree walk fails.  If the name is
+ * found, returns the result of fetching its value, which is read from
+ * the out-of-line blocks when rmtblkno > 0 and ATTR_KERNOVAL is clear.
+ * Otherwise the lookup's own result code is returned unchanged.
+ */
+int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int error, retval;
+	int i;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	} else if (retval == EEXIST) {
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->bp != NULL);
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+		/*
+		 * Get the value, local or "remote"
+		 */
+		retval = xfs_attr_leaf_getvalue(blk->bp, args);
+		if (!retval && (args->rmtblkno > 0)
+		    && !(args->flags & ATTR_KERNOVAL)) {
+			retval = xfs_attr_rmtval_get(args);
+		}
+	}
+
+	/*
+	 * If not in a transaction, we have to release all the buffers.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+/*
+ * Copy out attribute entries for attr_list(), for attribute forks that
+ * are not a single block.
+ *
+ * The cursor is marked initted.  If it names a block (blkno > 0) that
+ * block is read and kept only if it is a leaf whose first entry's
+ * hashval is below the cursor's hashval and whose last entry's hashval
+ * is not.  Any other block is released and the search restarts from
+ * block 0, descending through node blocks by hashval until a leaf is
+ * reached.  Leaves are then processed in "forw" order until
+ * xfs_attr_leaf_list_int() returns non-zero or the chain ends.
+ *
+ * Returns a read error, EFSCORRUPTED for a missing buffer or unexpected
+ * magic number, and otherwise 0.  A non-zero return from
+ * xfs_attr_leaf_list_int() only stops the walk and is not propagated.
+ */
+STATIC int							/* error */
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	int error, i;
+	xfs_dabuf_t *bp;
+
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	/*
+	 * Do all sorts of validation on the passed-in cursor structure.
+	 * If anything is amiss, ignore the cursor and look up the hashval
+	 * starting from the btree root.
+	 */
+	bp = NULL;
+	if (cursor->blkno > 0) {
+		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
+		if ((error != 0) && (error != EFSCORRUPTED))
+			return(error);
+		if (bp) {
+			node = bp->data;
+			switch (INT_GET(node->hdr.info.magic, ARCH_CONVERT)) {
+			case XFS_DA_NODE_MAGIC:
+				xfs_attr_trace_l_cn("wrong blk", context, node);
+				xfs_da_brelse(NULL, bp);
+				bp = NULL;
+				break;
+			case XFS_ATTR_LEAF_MAGIC:
+				leaf = bp->data;	/* NOTE(review): the test
+					 * below indexes entries[count-1];
+					 * confirm hdr.count cannot be 0
+					 * for a block reached here. */
+				if (cursor->hashval >
+				    INT_GET(leaf->entries[
+					 INT_GET(leaf->hdr.count,
+						ARCH_CONVERT)-1].hashval,
+							ARCH_CONVERT)) {
+					xfs_attr_trace_l_cl("wrong blk",
+							   context, leaf);
+					xfs_da_brelse(NULL, bp);
+					bp = NULL;
+				} else if (cursor->hashval <=
+					     INT_GET(leaf->entries[0].hashval,
+							ARCH_CONVERT)) {
+					xfs_attr_trace_l_cl("maybe wrong blk",
+							   context, leaf);
+					xfs_da_brelse(NULL, bp);
+					bp = NULL;
+				}
+				break;
+			default:
+				xfs_attr_trace_l_c("wrong blk - ??", context);
+				xfs_da_brelse(NULL, bp);
+				bp = NULL;
+			}
+		}
+	}
+
+	/*
+	 * We did not find what we expected given the cursor's contents,
+	 * so we start from the top and work down based on the hash value.
+	 * Note that start of node block is same as start of leaf block.
+	 * If the cursor's hashval is above every entry in a node block
+	 * there is nothing left to list and 0 is returned.
+	 */
+	if (bp == NULL) {
+		cursor->blkno = 0;
+		for (;;) {
+			error = xfs_da_read_buf(NULL, context->dp,
+						      cursor->blkno, -1, &bp,
+						      XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			if (bp == NULL)
+				return(XFS_ERROR(EFSCORRUPTED));
+			node = bp->data;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+							== XFS_ATTR_LEAF_MAGIC)
+				break;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+							!= XFS_DA_NODE_MAGIC) {
+				xfs_da_brelse(NULL, bp);
+				return(XFS_ERROR(EFSCORRUPTED));
+			}
+			btree = node->btree;
+			for (i = 0;
+				i < INT_GET(node->hdr.count, ARCH_CONVERT);
+								btree++, i++) {
+				if (cursor->hashval
+						<= INT_GET(btree->hashval,
+							    ARCH_CONVERT)) {
+					cursor->blkno = INT_GET(btree->before, ARCH_CONVERT);
+					xfs_attr_trace_l_cb("descending",
+							    context, btree);
+					break;
+				}
+			}
+			if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
+				xfs_da_brelse(NULL, bp);
+				return(0);
+			}
+			xfs_da_brelse(NULL, bp);
+		}
+	}
+	ASSERT(bp != NULL);
+
+	/*
+	 * Roll upward through the blocks, processing each leaf block in
+	 * order.  As long as there is space in the result buffer, keep
+	 * adding the information.
+	 */
+	for (;;) {
+		leaf = bp->data;
+		if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_ATTR_LEAF_MAGIC) {
+			xfs_da_brelse(NULL, bp);
+			return(XFS_ERROR(EFSCORRUPTED));
+		}
+		error = xfs_attr_leaf_list_int(bp, context);
+		if (error || (INT_ISZERO(leaf->hdr.info.forw, ARCH_CONVERT)))
+			break;	/* not really an error, buffer full or EOF */
+		cursor->blkno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT);
+		xfs_da_brelse(NULL, bp);
+		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		if (bp == NULL)
+			return(XFS_ERROR(EFSCORRUPTED));
+	}
+	xfs_da_brelse(NULL, bp);
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines for manipulating out-of-line attribute values.
+ *========================================================================*/ + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +STATIC int +xfs_attr_rmtval_get(xfs_da_args_t *args) +{ + xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; + xfs_mount_t *mp; + xfs_daddr_t dblkno; + xfs_caddr_t dst; + xfs_buf_t *bp; + int nmap, error, tmp, valuelen, blkcnt, i; + xfs_dablk_t lblkno; + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + + mp = args->dp->i_mount; + dst = args->value; + valuelen = args->valuelen; + lblkno = args->rmtblkno; + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + NULL, 0, map, &nmap, NULL); + if (error) + return(error); + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, + blkcnt, XFS_BUF_LOCK, &bp); + if (error) + return(error); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) + ? valuelen : XFS_BUF_SIZE(bp); + xfs_biomove(bp, 0, tmp, dst, XFS_B_READ); + xfs_buf_relse(bp); + dst += tmp; + valuelen -= tmp; + + lblkno += map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. 
+ */ +STATIC int +xfs_attr_rmtval_set(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_fileoff_t lfileoff; + xfs_inode_t *dp; + xfs_bmbt_irec_t map; + xfs_daddr_t dblkno; + xfs_caddr_t src; + xfs_buf_t *bp; + xfs_dablk_t lblkno; + int blkcnt, valuelen, nmap, error, tmp, committed; + + dp = args->dp; + mp = dp->i_mount; + src = args->value; + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. + */ + blkcnt = XFS_B_TO_FSB(mp, args->valuelen); + lfileoff = 0; + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) { + return(error); + } + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + /* + * Allocate a single extent, up to the size of the value. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | + XFS_BMAPI_WRITE, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. 
+ */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + valuelen = args->valuelen; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, NULL); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, + blkcnt, XFS_BUF_LOCK); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : + XFS_BUF_SIZE(bp); + xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE); + if (tmp < XFS_BUF_SIZE(bp)) + xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ + return (error); + } + src += tmp; + valuelen -= tmp; + + lblkno += map.br_blockcount; + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +STATIC int +xfs_attr_rmtval_remove(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_bmbt_irec_t map; + xfs_buf_t *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno; + int valuelen, blkcnt, nmap, error, done, committed; + + mp = args->dp->i_mount; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. 
+ */ + lblkno = args->rmtblkno; + valuelen = args->rmtblkcnt; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, + args->flist); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + /* bp = incore(mp->m_dev, dblkno, blkcnt, 1); */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, 1); + if (bp) { + XFS_BUF_STALE(bp); + XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + valuelen -= map.br_blockcount; + + lblkno += map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, args->dp); + } + + /* + * Close out trans and start the next one in the chain. 
+ */ + if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) + return (error); + } + return(0); +} + +#if defined(XFS_ATTR_TRACE) +/* + * Add a trace buffer entry for an attr_list context structure. + */ +void +xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + (context->count > 0) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)NULL, + (__psunsigned_t)NULL, + (__psunsigned_t)NULL); +} + +/* + * Add a trace buffer entry for a context structure and a Btree node. + */ +void +xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, + struct xfs_da_intnode *node) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + (context->count > 0) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT), + (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT)); +} + +/* + * Add a trace buffer entry for a context structure and a Btree element. 
+ */ +void +xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, + struct xfs_da_node_entry *btree) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + (context->count > 0) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)INT_GET(btree->hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(btree->before, ARCH_CONVERT), + (__psunsigned_t)NULL); +} + +/* + * Add a trace buffer entry for a context structure and a leaf block. + */ +void +xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, + struct xfs_attr_leafblock *leaf) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + (context->count > 0) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT), + (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT)); +} + +/* + * Add a trace buffer entry for the arguments given to the routine, + * generic form. 
+ */ +void +xfs_attr_trace_enter(int type, char *where, + __psunsigned_t a2, __psunsigned_t a3, + __psunsigned_t a4, __psunsigned_t a5, + __psunsigned_t a6, __psunsigned_t a7, + __psunsigned_t a8, __psunsigned_t a9, + __psunsigned_t a10, __psunsigned_t a11, + __psunsigned_t a12, __psunsigned_t a13, + __psunsigned_t a14, __psunsigned_t a15) +{ + ASSERT(xfs_attr_trace_buf); + ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), + (void *)where, + (void *)a2, (void *)a3, (void *)a4, + (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10, + (void *)a11, (void *)a12, (void *)a13, + (void *)a14, (void *)a15); +} +#endif /* XFS_ATTR_TRACE */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_attr.h linux-2.4-xfs/fs/xfs/xfs_attr.h --- linux-2.4.19/fs/xfs/xfs_attr.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_attr.h Wed Jul 10 23:13:50 2002 @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ATTR_H__ +#define __XFS_ATTR_H__ + +/* + * xfs_attr.h + * + * Large attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. + * The internal links in the Btree are logical block offsets into the file. + * + * Small attribute lists use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +#ifdef XFS_ALL_TRACE +#define XFS_ATTR_TRACE +#endif + +#if !defined(DEBUG) +#undef XFS_ATTR_TRACE +#endif + + +/*======================================================================== + * External interfaces + *========================================================================*/ + +#define ATTR_ROOT 0x0002 /* use attrs in root namespace, not user */ +#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ +#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ +#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ +#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ +#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */ +#define ATTR_KERNFULLS 0x8000 /* [kernel] full attr list, ie. 
root+user */ + +/* + * The maximum size (into the kernel or returned from the kernel) of an + * attribute value or the buffer used for an attr_list() call. Larger + * sizes will result in an E2BIG return code. + */ +#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ + +/* + * Define how lists of attribute names are returned to the user from + * the attr_list() call. A large, 32bit aligned, buffer is passed in + * along with its size. We put an array of offsets at the top that each + * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. + */ +typedef struct attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +} attrlist_t; + +/* + * Show the interesting info about one attribute. This is what the + * al_offset[i] entry points to. + */ +typedef struct attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +} attrlist_ent_t; + +/* + * Given a pointer to the (char*) buffer containing the attr_list() result, + * and an index, return a pointer to the indicated attribute in the buffer. + */ +#define ATTR_ENTRY(buffer, index) \ + ((attrlist_ent_t *) \ + &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) + +/* + * Multi-attribute operation vector. + */ +typedef struct attr_multiop { + int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) 
*/ + int am_error; /* [out arg] result of this sub-op (an errno) */ + char *am_attrname; /* attribute name to work with */ + char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */ + int am_length; /* [in/out arg] length of value */ + int am_flags; /* bitwise OR of attr API flags defined above */ +} attr_multiop_t; + +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ + +/* + * Kernel-internal version of the attrlist cursor. + */ +typedef struct attrlist_cursor_kern { + __u32 hashval; /* hash value of next entry to add */ + __u32 blkno; /* block containing entry (suggestion) */ + __u32 offset; /* offset in list of equal-hashvals */ + __u16 pad1; /* padding to match user-level */ + __u8 pad2; /* padding to match user-level */ + __u8 initted; /* T/F: cursor has been initialized */ +} attrlist_cursor_kern_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +struct cred; +struct vnode; +struct xfs_inode; +struct attrlist_cursor_kern; +struct xfs_ext_attr; +struct xfs_da_args; + +/* + * Overall external interface routines. 
+ */ +int xfs_attr_get(bhv_desc_t *, char *, char *, int *, int, struct cred *); +int xfs_attr_set(bhv_desc_t *, char *, char *, int, int, struct cred *); +int xfs_attr_remove(bhv_desc_t *, char *, int, struct cred *); +int xfs_attr_list(bhv_desc_t *, char *, int, int, + struct attrlist_cursor_kern *, struct cred *); +int xfs_attr_inactive(struct xfs_inode *dp); + +int xfs_attr_node_get(struct xfs_da_args *); +int xfs_attr_leaf_get(struct xfs_da_args *); +int xfs_attr_shortform_getvalue(struct xfs_da_args *); +int xfs_attr_fetch(struct xfs_inode *, char *, char *, int); + +#endif /* __XFS_ATTR_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_attr_fetch.c linux-2.4-xfs/fs/xfs/xfs_attr_fetch.c --- linux-2.4.19/fs/xfs/xfs_attr_fetch.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_attr_fetch.c Wed Jul 10 23:13:50 2002 @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +int +xfs_attr_fetch(xfs_inode_t *ip, char *name, char *value, int valuelen) +{ + xfs_da_args_t args; + int error; + + if (XFS_IFORK_Q(ip) == 0) + return ENOATTR; + /* + * Do the argument setup for the xfs_attr routines. + */ + bzero((char *)&args, sizeof(args)); + args.dp = ip; + args.flags = ATTR_ROOT; + args.whichfork = XFS_ATTR_FORK; + args.name = name; + args.namelen = strlen(name); + args.value = value; + args.valuelen = valuelen; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.oknoent = 1; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (args.dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + error = xfs_attr_shortform_getvalue(&args); + else if (xfs_bmap_one_block(args.dp, XFS_ATTR_FORK)) + error = xfs_attr_leaf_get(&args); + else + error = xfs_attr_node_get(&args); + + if (error == EEXIST) + error = 0; + + return(error); +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_attr_leaf.c linux-2.4-xfs/fs/xfs/xfs_attr_leaf.c --- linux-2.4.19/fs/xfs/xfs_attr_leaf.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_attr_leaf.c Thu Aug 8 20:03:32 2002 @@ -0,0 +1,2971 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +/* + * xfs_attr_leaf.c + * + * GROT: figure out how to recover gracefully when bmap returns ENOSPC. + */ + +#include + +/* + * xfs_attr_leaf.c + * + * Routines to implement leaf blocks of attributes as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, + int freemap_index); +STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); +STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + xfs_da_state_blk_t *leaf_blk_2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); + +/* + * Utility routines. 
+ */ +STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, + int src_start, + xfs_attr_leafblock_t *dst_leaf, + int dst_start, int move_count, + xfs_mount_t *mp); + + +/*======================================================================== + * External routines when dirsize < XFS_LITINO(mp). + *========================================================================*/ + +/* + * Create the initial contents of a shortform attribute list. + */ +int +xfs_attr_shortform_create(xfs_da_args_t *args) +{ + xfs_attr_sf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + ASSERT(dp != NULL); + ifp = dp->i_afp; + ASSERT(ifp != NULL); + ASSERT(ifp->if_bytes == 0); + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_flags |= XFS_IFINLINE; + } else { + ASSERT(ifp->if_flags & XFS_IFINLINE); + } + xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); + hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; + INT_ZERO(hdr->count, ARCH_CONVERT); + INT_SET(hdr->totsize, ARCH_CONVERT, sizeof(*hdr)); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + return(0); +} + +/* + * Add a name/value pair to the shortform attribute list. + * Overflow from the inode has already been checked for. 
+ */ +int +xfs_attr_shortform_add(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i, offset, size; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + ifp = dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (bcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; + return(XFS_ERROR(EEXIST)); + } + + offset = (char *)sfe - (char *)sf; + size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + + sfe->namelen = args->namelen; + INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen); + sfe->flags = (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0; + bcopy(args->name, sfe->nameval, args->namelen); + bcopy(args->value, &sfe->nameval[args->namelen], args->valuelen); + INT_MOD(sf->hdr.count, ARCH_CONVERT, 1); + INT_MOD(sf->hdr.totsize, ARCH_CONVERT, size); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + return(0); +} + +/* + * Remove a name from the shortform attribute list structure. + */ +int +xfs_attr_shortform_remove(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int base, size=0, end, totsize, i; + xfs_inode_t *dp; + + /* + * Remove the attribute. 
+ */ + dp = args->dp; + base = sizeof(xfs_attr_sf_hdr_t); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + base += size, i++) { + size = XFS_ATTR_SF_ENTSIZE(sfe); + if (sfe->namelen != args->namelen) + continue; + if (bcmp(sfe->nameval, args->name, args->namelen) != 0) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; + break; /* match: base is the entry's byte offset, size its length */ + } + if (i == INT_GET(sf->hdr.count, ARCH_CONVERT)) + return(XFS_ERROR(ENOATTR)); + + end = base + size; + totsize = INT_GET(sf->hdr.totsize, ARCH_CONVERT); + if (end != totsize) { + ovbcopy(&((char *)sf)[end], &((char *)sf)[base], + totsize - end); /* move the bytes after the entry down over it */ + } + INT_MOD(sf->hdr.count, ARCH_CONVERT, -1); + INT_MOD(sf->hdr.totsize, ARCH_CONVERT, -size); + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + return(0); +} + +/* + * Look up a name in a shortform attribute list structure; the result is EEXIST if present, ENOATTR if not. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_lookup(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + xfs_ifork_t *ifp; + + ifp = args->dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (bcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; /* same name, but in the other namespace */ + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Look up a name in a shortform attribute list and return its value (or only its length for ATTR_KERNOVAL).
+ */ +/*ARGSUSED*/ +int +xfs_attr_shortform_getvalue(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + + ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (bcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + return(XFS_ERROR(EEXIST)); + } + if (args->valuelen < INT_GET(sfe->valuelen, ARCH_CONVERT)) { + args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + return(XFS_ERROR(ERANGE)); + } + args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + bcopy(&sfe->nameval[args->namelen], args->value, + args->valuelen); + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Convert from using the shortform to the leaf. + */ +int +xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_da_args_t nargs; + char *tmpbuffer; + int error, i, size; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + xfs_ifork_t *ifp; + + dp = args->dp; + ifp = dp->i_afp; + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + size = INT_GET(sf->hdr.totsize, ARCH_CONVERT); + tmpbuffer = kmem_alloc(size, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + bcopy(ifp->if_u1.if_data, tmpbuffer, size); + sf = (xfs_attr_shortform_t *)tmpbuffer; + + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + bp = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) { + /* + * If we hit an IO error middle of the transaction inside + * grow_inode(), we may have inconsistent data. Bail out. 
+ */ + if (error == EIO) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + bcopy(tmpbuffer, ifp->if_u1.if_data, size); /* it back */ + goto out; + } + + ASSERT(blkno == 0); + error = xfs_attr_leaf_create(args, blkno, &bp); + if (error) { + error = xfs_da_shrink_inode(args, 0, bp); + bp = NULL; + if (error) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + bcopy(tmpbuffer, ifp->if_u1.if_data, size); /* it back */ + goto out; + } + + bzero((char *)&nargs, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.oknoent = 1; + + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + nargs.name = (char *)sfe->nameval; + nargs.namelen = sfe->namelen; + nargs.value = (char *)&sfe->nameval[nargs.namelen]; + nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + nargs.hashval = xfs_da_hashname((char *)sfe->nameval, + sfe->namelen); + nargs.flags = (sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0; + error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + ASSERT(error == ENOATTR); + error = xfs_attr_leaf_add(bp, &nargs); + ASSERT(error != ENOSPC); + if (error) + goto out; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + error = 0; + +out: + if(bp) + xfs_da_buf_done(bp); + kmem_free(tmpbuffer, size); + return(error); +} + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (INT_GET(sa->hash, ARCH_CONVERT) + < INT_GET(sb->hash, ARCH_CONVERT)) { + return(-1); + } else if (INT_GET(sa->hash, ARCH_CONVERT) + > INT_GET(sb->hash, ARCH_CONVERT)) { + return(1); + } else { + return(sa->entno - sb->entno); + } +} + +/* + * Copy out entries of shortform attribute lists for attr_list(). 
+ * Shortform attribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then + * we have to calculate each entry's hash value and sort them before + * we can begin returning them to the user. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (INT_ISZERO(sf->hdr.count, ARCH_CONVERT)) + return(0); + cursor = context->cursor; + ASSERT(cursor != NULL); + + xfs_attr_trace_l_c("sf start", context); + + /* + * If the buffer is large enough, do not bother with sorting. + * Note the generous fudge factor of 16 overhead bytes per entry. + */ + if ((dp->i_afp->if_bytes + INT_GET(sf->hdr.count, ARCH_CONVERT) * 16) + < context->bufsize) { + for (i = 0, sfe = &sf->list[0]; + i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + int ns = (sfe->flags & XFS_ATTR_ROOT)? + ROOT_NAMES : USER_NAMES; + if (((context->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0) && + !(context->flags & ATTR_KERNFULLS)) { + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + continue; /* other namespace and not a full (root+user) listing */ + } + if (context->flags & ATTR_KERNOVAL) { + ASSERT(context->flags & ATTR_KERNAMELS); + context->count += xfs_namespaces[ns].namelen + + INT_GET(sfe->namelen, ARCH_CONVERT) + 1; /* no copy: add namespace namelen + entry namelen + 1 to count */ + } + else { + if (xfs_attr_put_listent(context, ns, + (char *)sfe->nameval, + (int)sfe->namelen, + (int)INT_GET(sfe->valuelen, + ARCH_CONVERT))) + break; /* nonzero from put_listent ends the scan */ + } + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + xfs_attr_trace_l_c("sf big-gulp", context); + return(0); + } + + /* + * It didn't all fit, so we have to sort everything on hashval.
+ */ + sbsize = INT_GET(sf->hdr.count, ARCH_CONVERT) * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; + i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + if (((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)) || + (sfe->namelen >= MAXNAMELEN)) { + xfs_attr_trace_l_c("sf corrupted", context); + kmem_free(sbuf, sbsize); + return XFS_ERROR(EFSCORRUPTED); + } + if (((context->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0) && + !(context->flags & ATTR_KERNFULLS)) { + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + continue; + } + sbp->entno = i; + INT_SET(sbp->hash, ARCH_CONVERT, + xfs_da_hashname((char *)sfe->nameval, sfe->namelen)); + sbp->name = (char *)sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. + */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (INT_GET(sbp->hash, ARCH_CONVERT) == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (INT_GET(sbp->hash, ARCH_CONVERT) > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf, sbsize); + xfs_attr_trace_l_c("blk end", context); + return(0); + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + int ns = (sbp->flags & XFS_ATTR_ROOT)? 
ROOT_NAMES:USER_NAMES; + if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) { + cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT); + cursor->offset = 0; + } + if (context->flags & ATTR_KERNOVAL) { + ASSERT(context->flags & ATTR_KERNAMELS); + context->count += xfs_namespaces[ns].namelen + + sbp->namelen + 1; + } + else { + if (xfs_attr_put_listent(context, ns, + sbp->name, sbp->namelen, + INT_GET(sbp->valuelen, ARCH_CONVERT))) + break; + } + cursor->offset++; + } + + kmem_free(sbuf, sbsize); + xfs_attr_trace_l_c("sf E-O-F", context); + return(0); +} + +/* + * Check a leaf attribute block to see if all the entries would fit into + * a shortform attribute list. + */ +int +xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + int bytes, i; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + + entry = &leaf->entries[0]; + bytes = sizeof(struct xfs_attr_sf_hdr); + for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!(entry->flags & XFS_ATTR_LOCAL)) + return(0); + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + if (INT_GET(name_loc->valuelen, ARCH_CONVERT) >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + bytes += sizeof(struct xfs_attr_sf_entry)-1 + + name_loc->namelen + + INT_GET(name_loc->valuelen, ARCH_CONVERT); + } + return( bytes < XFS_IFORK_ASIZE(dp) ); +} + +/* + * Convert a leaf attribute list to shortform attribute list + */ +int +xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_da_args_t nargs; + xfs_inode_t *dp; + char *tmpbuffer; + int error, i; + + dp = args->dp; + tmpbuffer = 
kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + + ASSERT(bp != NULL); + bcopy(bp->data, tmpbuffer, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_attr_leafblock_t *)tmpbuffer; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + bzero(bp->data, XFS_LBSIZE(dp->i_mount)); + + /* + * Clean out the prior contents of the attribute list. + */ + error = xfs_da_shrink_inode(args, 0, bp); + if (error) + goto out; + error = xfs_attr_shortform_create(args); + if (error) + goto out; + + /* + * Copy the attributes + */ + bzero((char *)&nargs, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.oknoent = 1; + entry = &leaf->entries[0]; + for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (INT_ISZERO(entry->nameidx, ARCH_CONVERT)) + continue; + ASSERT(entry->flags & XFS_ATTR_LOCAL); + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + nargs.name = (char *)name_loc->nameval; + nargs.namelen = name_loc->namelen; + nargs.value = (char *)&name_loc->nameval[nargs.namelen]; + nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT); + nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT); + nargs.flags = (entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0; + xfs_attr_shortform_add(&nargs); + } + error = 0; + +out: + kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); + return(error); +} + +/* + * Convert from using a single leaf to a root node and a leaf. 
+ */
+int
+xfs_attr_leaf_to_node(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp1, *bp2;
+	xfs_dablk_t blkno;
+	int error;
+	/*
+	 * Overview: obtain a new block number (blkno) from
+	 * xfs_da_grow_inode(), read the existing leaf from block 0 into
+	 * bp1, copy it into the new block (bp2) and log the copy. bp1 is
+	 * then released and reused for the node built by
+	 * xfs_da_node_create(). Returns 0 or the first error; any buffer
+	 * still held is passed to xfs_da_buf_done() at "out".
+	 */
+	dp = args->dp;
+	bp1 = bp2 = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		goto out;
+	error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
+					     XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	ASSERT(bp1 != NULL);
+	bp2 = NULL;
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
+					    XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	ASSERT(bp2 != NULL);
+	bcopy(bp1->data, bp2->data, XFS_LBSIZE(dp->i_mount));
+	xfs_da_buf_done(bp1);
+	bp1 = NULL;		/* so "out" does not release it twice */
+	xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	/*
+	 * Set up the new root node.
+	 *
+	 * The node gets a single btree entry: its "before" pointer is the
+	 * block the leaf was copied to, and its hashval is that of the
+	 * last entry in the leaf. (The second argument to
+	 * xfs_da_node_create() is presumably the node's block number, i.e.
+	 * block 0 -- confirm against its definition.)
+	 */
+	error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	node = bp1->data;
+	leaf = bp2->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	/* both on-disk, don't endian-flip twice */
+	node->btree[0].hashval =
+		leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval;
+	INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
+	INT_SET(node->hdr.count, ARCH_CONVERT, 1);
+	xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+	error = 0;
+out:
+	if (bp1)
+		xfs_da_buf_done(bp1);
+	if (bp2)
+		xfs_da_buf_done(bp2);
+	return(error);
+}
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+int
+xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int error;
+	/*
+	 * Gets a buffer for block "blkno" of the attribute fork, zeroes
+	 * it, writes an empty leaf header and logs the whole block. On
+	 * success the buffer is returned through *bpp and the result is
+	 * 0; otherwise the error from xfs_da_get_buf() is returned and
+	 * *bpp is left untouched.
+	 */
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+					    XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	bzero((char *)leaf, XFS_LBSIZE(dp->i_mount));
+	hdr = &leaf->hdr;
+	INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_ATTR_LEAF_MAGIC);
+	INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
+	/*
+	 * NOTE(review): the stored value reads back as zero only if the
+	 * block size does not fit in the firstused field (presumably a
+	 * truncation of the on-disk field width -- confirm); in that case
+	 * back off by one name-alignment unit.
+	 */
+	if (INT_ISZERO(hdr->firstused, ARCH_CONVERT)) {
+		INT_SET(hdr->firstused, ARCH_CONVERT,
+			XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
+	}
+
+	/* one free region: from the end of the header up to firstused */
+	INT_SET(hdr->freemap[0].base, ARCH_CONVERT,
+				      sizeof(xfs_attr_leaf_hdr_t));
+	INT_SET(hdr->freemap[0].size, ARCH_CONVERT,
+			INT_GET(hdr->firstused, ARCH_CONVERT)
+				- INT_GET(hdr->freemap[0].base,
+					  ARCH_CONVERT));
+
+	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	*bpp = bp;
+	return(0);
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ *
+ * oldblk describes the leaf being split; newblk is filled in here (bp,
+ * blkno, magic) to describe the newly created leaf. Returns the first
+ * error from xfs_da_grow_inode(), xfs_attr_leaf_create() or
+ * xfs_da_blk_link(); otherwise the result of xfs_attr_leaf_add(). The
+ * cached last-hash values of both blocks are refreshed even when the
+ * add reports an error.
+ */
+int
+xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+				   xfs_da_state_blk_t *newblk)
+{
+	xfs_dablk_t blkno;
+	int error;
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(state->args, &blkno);
+	if (error)
+		return(error);
+	error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+	if (error)
+		return(error);
+	newblk->blkno = blkno;
+	newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 * NOTE: rebalance() currently depends on the 2nd block being empty.
+	 */
+	xfs_attr_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da_blk_link(state, oldblk, newblk);
+	if (error)
+		return(error);
+
+	/*
+	 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+	 * "new" attrs info. Will need the "old" info to remove it later.
+	 *
+	 * Insert the "new" entry in the correct block.
+	 * (state->inleaf was set by xfs_attr_leaf_rebalance() above.)
+	 */
+	if (state->inleaf)
+		error = xfs_attr_leaf_add(oldblk->bp, state->args);
+	else
+		error = xfs_attr_leaf_add(newblk->bp, state->args);
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+	return(error);
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ *
+ * args->index is the slot in the entry table where the new entry goes.
+ * Returns the result of xfs_attr_leaf_add_work(), or ENOSPC (through
+ * XFS_ERROR) when the entry does not fit, including after compaction.
+ */
+int
+xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_map_t *map;
+	int tablesize, entsize, sum, tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT((args->index >= 0)
+		&& (args->index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+	hdr = &leaf->hdr;
+	entsize = xfs_attr_leaf_newentsize(args,
+			   args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 *
+	 * The map is walked from its last slot down to slot 0, and "sum"
+	 * collects the size of every region that was not used. While the
+	 * entry table (with one more slot) would overlap firstused, no
+	 * region is tried; the sizes are only summed.
+	 */
+	tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
+	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
+		if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
+			sum += INT_GET(map->size, ARCH_CONVERT);
+			continue;
+		}
+		if (INT_ISZERO(map->size, ARCH_CONVERT))
+			continue;	/* no space in this map */
+		tmp = entsize;
+		/* a region below firstused must also hold the new table slot */
+		if (INT_GET(map->base, ARCH_CONVERT)
+				< INT_GET(hdr->firstused, ARCH_CONVERT))
+			tmp += sizeof(xfs_attr_leaf_entry_t);
+		if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
+			tmp = xfs_attr_leaf_add_work(bp, args, i);
+			return(tmp);
+		}
+		sum += INT_GET(map->size, ARCH_CONVERT);
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!hdr->holes && (sum < entsize))
+		return(XFS_ERROR(ENOSPC));
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * This may change the hdr->count via dropping INCOMPLETE entries.
+	 */
+	xfs_attr_leaf_compact(args->trans, bp);
+
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0]. If it is not big enough, give up.
+	 */
+	if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT)
+				< (entsize + sizeof(xfs_attr_leaf_entry_t)))
+		return(XFS_ERROR(ENOSPC));
+
+	return(xfs_attr_leaf_add_work(bp, args, 0));
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_attr_leaf_map_t *map;
+	xfs_mount_t *mp;
+	int tmp, i;
+	/*
+	 * mapindex selects the freemap region the new name/value is carved
+	 * from; args->index is the entry-table slot to fill. The function
+	 * logs every range it changes and always returns 0.
+	 */
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
+	ASSERT((args->index >= 0)
+		&& (args->index <= INT_GET(hdr->count, ARCH_CONVERT)));
+
+	/*
+	 * Force open some space in the entry array and fill it in.
+	 */
+	entry = &leaf->entries[args->index];
+	if (args->index < INT_GET(hdr->count, ARCH_CONVERT)) {
+		tmp = INT_GET(hdr->count, ARCH_CONVERT) - args->index;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		ovbcopy((char *)entry, (char *)(entry+1), tmp);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+	}
+	INT_MOD(hdr->count, ARCH_CONVERT, 1);
+
+	/*
+	 * Allocate space for the new string (at the end of the run).
+	 *
+	 * The region's size is reduced by the new entry's size and the
+	 * name/value is placed at base + (reduced) size. The third
+	 * argument of xfs_attr_leaf_newentsize() sets "tmp", which then
+	 * decides whether the entry is flagged XFS_ATTR_LOCAL.
+	 */
+	map = &hdr->freemap[mapindex];
+	mp = args->trans->t_mountp;
+	ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT((INT_GET(map->base, ARCH_CONVERT) & 0x3) == 0);
+	ASSERT(INT_GET(map->size, ARCH_CONVERT)
+				>= xfs_attr_leaf_newentsize(args,
+					     mp->m_sb.sb_blocksize, NULL));
+	ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT((INT_GET(map->size, ARCH_CONVERT) & 0x3) == 0);
+	INT_MOD(map->size, ARCH_CONVERT,
+		-xfs_attr_leaf_newentsize(args, mp->m_sb.sb_blocksize, &tmp));
+	INT_SET(entry->nameidx, ARCH_CONVERT,
+					INT_GET(map->base, ARCH_CONVERT)
+				      + INT_GET(map->size, ARCH_CONVERT));
+	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
+	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+	entry->flags |= (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0;
+	if (args->rename) {
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		/* inserting at or before index2 in the same block shifts it */
+		if ((args->blkno2 == args->blkno) &&
+		    (args->index2 <= args->index)) {
+			args->index2++;
+		}
+	}
+	xfs_da_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	ASSERT((args->index == 0) || (INT_GET(entry->hashval, ARCH_CONVERT)
+						>= INT_GET((entry-1)->hashval,
+							    ARCH_CONVERT)));
+	ASSERT((args->index == INT_GET(hdr->count, ARCH_CONVERT)-1) ||
+	       (INT_GET(entry->hashval, ARCH_CONVERT)
+			    <= (INT_GET((entry+1)->hashval, ARCH_CONVERT))));
+
+	/*
+	 * Copy the attribute name and value into the new space.
+	 *
+	 * For "remote" attribute values, simply note that we need to
+	 * allocate space for the "remote" value. We can't actually
+	 * allocate the extents in this transaction, and we can't decide
+	 * which blocks they should be as we might allocate more blocks
+	 * as part of this transaction (a split operation for example).
+	 */
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc->namelen = args->namelen;
+		INT_SET(name_loc->valuelen, ARCH_CONVERT, args->valuelen);
+		bcopy(args->name, (char *)name_loc->nameval, args->namelen);
+		bcopy(args->value, (char *)&name_loc->nameval[args->namelen],
+				   INT_GET(name_loc->valuelen, ARCH_CONVERT));
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt->namelen = args->namelen;
+		bcopy(args->name, (char *)name_rmt->name, args->namelen);
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		/* just in case */
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		args->rmtblkno = 1;
+		args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+	}
+	xfs_da_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+				   xfs_attr_leaf_entsize(leaf, args->index)));
+
+	/*
+	 * Update the control info for this leaf node
+	 *
+	 * firstused is lowered if the new name/value now starts before
+	 * it. Any freemap region that began exactly at the old end of
+	 * the entry table gives up the bytes taken by the new table slot.
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT)
+				< INT_GET(hdr->firstused, ARCH_CONVERT)) {
+		/* both on-disk, don't endian-flip twice */
+		hdr->firstused = entry->nameidx;
+	}
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
+				>= ((INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(*entry))+sizeof(*hdr)));
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
+		if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
+			INT_MOD(map->base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_entry_t));
+			INT_MOD(map->size, ARCH_CONVERT,
+					-sizeof(xfs_attr_leaf_entry_t));
+		}
+	}
+	INT_MOD(hdr->usedbytes, ARCH_CONVERT,
+				xfs_attr_leaf_entsize(leaf, args->index));
+	xfs_da_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+	return(0);
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ *
+ * The current contents are saved in a temporary buffer, the block is
+ * zeroed and given a fresh header (same info, no entries, no holes, one
+ * free region in freemap[0]), and the saved entries are moved back with
+ * xfs_attr_leaf_moveents(). The whole block is then logged and the
+ * temporary buffer freed.
+ */
+STATIC void
+xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
+{
+	xfs_attr_leafblock_t *leaf_s, *leaf_d;
+	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+
+	mp = trans->t_mountp;
+	tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	bcopy(bp->data, tmpbuffer, XFS_LBSIZE(mp));
+	bzero(bp->data, XFS_LBSIZE(mp));
+
+	/*
+	 * Copy basic information
+	 */
+	leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;	/* source: saved copy */
+	leaf_d = bp->data;				/* destination: real block */
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	hdr_d->info = hdr_s->info;	/* struct copy */
+	INT_SET(hdr_d->firstused, ARCH_CONVERT, XFS_LBSIZE(mp));
+	/* handle truncation gracefully */
+	if (INT_ISZERO(hdr_d->firstused, ARCH_CONVERT)) {
+		INT_SET(hdr_d->firstused, ARCH_CONVERT,
+				XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
+	}
+	INT_ZERO(hdr_d->usedbytes, ARCH_CONVERT);
+	INT_ZERO(hdr_d->count, ARCH_CONVERT);
+	hdr_d->holes = 0;
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_hdr_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
+				INT_GET(hdr_d->firstused, ARCH_CONVERT)
+			      - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+
+	/*
+	 * Copy all entries in the same (sorted) order,
+	 * but allocate name/value pairs packed and in sequence.
+	 */
+	xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
+				(int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
+
+	xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+
+	kmem_free(tmpbuffer, XFS_LBSIZE(mp));
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block. At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block. Those
+ * values are used in "atomic rename" operations on attributes. Note that
+ * the "new" and "old" values can end up in different blocks.
+ *
+ * Also sets state->inleaf and the hashval of both block descriptors.
+ */
+STATIC void
+xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				       xfs_da_state_blk_t *blk2)
+{
+	xfs_da_args_t *args;
+	xfs_da_state_blk_t *tmp_blk;
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+	int count, totallen, max, space, swap;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	args = state->args;
+
+	/*
+	 * Check ordering of blocks, reverse if it makes things simpler.
+	 *
+	 * NOTE: Given that all (current) callers pass in an empty
+	 * second block, this code should never set "swap".
+	 */
+	swap = 0;
+	if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+		tmp_blk = blk1;
+		blk1 = blk2;
+		blk2 = tmp_blk;
+		leaf1 = blk1->bp->data;
+		leaf2 = blk2->bp->data;
+		swap = 1;
+	}
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum. Then get
+	 * the direction to copy and the number of elements to move.
+	 *
+	 * "inleaf" is true if the new entry should be inserted into blk1.
+	 * If "swap" is also true, then reverse the sense of "inleaf".
+	 */
+	state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
+							    &count, &totallen);
+	if (swap)
+		state->inleaf = !state->inleaf;
+
+	/*
+	 * Move any entries required from leaf to leaf:
+	 */
+	if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count = INT_GET(hdr1->count, ARCH_CONVERT) - count;
+		space = INT_GET(hdr1->usedbytes, ARCH_CONVERT) - totallen;
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf2 is the destination, compact it if it looks tight.
+		 */
+		max = INT_GET(hdr2->firstused, ARCH_CONVERT)
+						- sizeof(xfs_attr_leaf_hdr_t);
+		max -= INT_GET(hdr2->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+		if (space > max) {
+			xfs_attr_leaf_compact(args->trans, blk2->bp);
+		}
+
+		/*
+		 * Move high entries from leaf1 to low end of leaf2.
+		 */
+		xfs_attr_leaf_moveents(leaf1,
+				INT_GET(hdr1->count, ARCH_CONVERT)-count,
+				leaf2, 0, count, state->mp);
+
+		xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+		xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+	} else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * I assert that since all callers pass in an empty
+		 * second buffer, this code should never execute.
+		 */
+
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count -= INT_GET(hdr1->count, ARCH_CONVERT);
+		space = totallen - INT_GET(hdr1->usedbytes, ARCH_CONVERT);
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf1 is the destination, compact it if it looks tight.
+		 */
+		max = INT_GET(hdr1->firstused, ARCH_CONVERT)
+						- sizeof(xfs_attr_leaf_hdr_t);
+		max -= INT_GET(hdr1->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+		if (space > max) {
+			xfs_attr_leaf_compact(args->trans, blk1->bp);
+		}
+
+		/*
+		 * Move low entries from leaf2 to high end of leaf1.
+		 */
+		xfs_attr_leaf_moveents(leaf2, 0, leaf1,
+				(int)INT_GET(hdr1->count, ARCH_CONVERT), count,
+				state->mp);
+
+		xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+		xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+	}
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	blk1->hashval =
+	    INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
+				    ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
+	blk2->hashval =
+	    INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
+				    ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 * NOTE: this code depends on the (current) situation that the
+	 * second block was originally empty.
+	 *
+	 * If the insertion point moved to the 2nd block, we must adjust
+	 * the index. We must also track the entry just following the
+	 * new entry for use in an "atomic rename" operation, that entry
+	 * is always the "old" entry and the "new" entry is what we are
+	 * inserting. The index/blkno fields refer to the "old" entry,
+	 * while the index2/blkno2 fields refer to the "new" entry.
+	 */
+	if (blk1->index > INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+		ASSERT(state->inleaf == 0);
+		blk2->index = blk1->index
+				- INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+		args->index = args->index2 = blk2->index;
+		args->blkno = args->blkno2 = blk2->blkno;
+	} else if (blk1->index == INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+		if (state->inleaf) {
+			args->index = blk1->index;
+			args->blkno = blk1->blkno;
+			args->index2 = 0;
+			args->blkno2 = blk2->blkno;
+		} else {
+			blk2->index = blk1->index
+				    - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+			args->index = args->index2 = blk2->index;
+			args->blkno = args->blkno2 = blk2->blkno;
+		}
+	} else {
+		ASSERT(state->inleaf == 1);
+		args->index = args->index2 = blk1->index;
+		args->blkno = args->blkno2 = blk1->blkno;
+	}
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary? With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ *
+ * On return *countarg is the number of existing entries counted into
+ * the lower block and *usedbytesarg the bytes they use, excluding
+ * entry-table slots and the new entry itself. The return value is
+ * non-zero when the new entry was accounted to the lower (first) block.
+ */
+STATIC int
+xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
+				    xfs_da_state_blk_t *blk1,
+				    xfs_da_state_blk_t *blk2,
+				    int *countarg, int *usedbytesarg)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+	xfs_attr_leaf_entry_t *entry;
+	int count, max, index, totallen, half;
+	int lastdelta, foundit, tmp;
+
+	/*
+	 * Set up environment.
+	 */
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+	foundit = 0;
+	totallen = 0;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.
+	 *
+	 * "half" is half of the bytes used by both blocks plus the new
+	 * entry (table slots included). The loop stops at the first
+	 * step that would move the running total further from "half".
+	 */
+	max = INT_GET(hdr1->count, ARCH_CONVERT)
+			+ INT_GET(hdr2->count, ARCH_CONVERT);
+	half = (max+1) * sizeof(*entry);
+	half += INT_GET(hdr1->usedbytes, ARCH_CONVERT)
+				+ INT_GET(hdr2->usedbytes, ARCH_CONVERT)
+				+ xfs_attr_leaf_newentsize(state->args,
+						     state->blocksize, NULL);
+	half /= 2;
+	lastdelta = state->blocksize;
+	entry = &leaf1->entries[0];
+	for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A)	(((A) < 0) ? -(A) : (A))
+		/*
+		 * The new entry is in the first block, account for it.
+		 */
+		if (count == blk1->index) {
+			tmp = totallen + sizeof(*entry)
+				+ xfs_attr_leaf_newentsize(state->args,
+							   state->blocksize,
+							   NULL);
+			if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+				break;
+			lastdelta = XFS_ATTR_ABS(half - tmp);
+			totallen = tmp;
+			foundit = 1;
+		}
+
+		/*
+		 * Wrap around into the second block if necessary.
+		 */
+		if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
+			leaf1 = leaf2;
+			entry = &leaf1->entries[0];
+			index = 0;
+		}
+
+		/*
+		 * Figure out if next leaf entry would be too much.
+		 */
+		tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+									index);
+		if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+			break;
+		lastdelta = XFS_ATTR_ABS(half - tmp);
+		totallen = tmp;
+#undef XFS_ATTR_ABS
+	}
+
+	/*
+	 * Calculate the number of usedbytes that will end up in lower block.
+	 * If new entry not in lower block, fix up the count.
+	 */
+	totallen -= count * sizeof(*entry);
+	if (foundit) {
+		totallen -= sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args,
+							 state->blocksize,
+							 NULL);
+	}
+
+	*countarg = count;
+	*usedbytesarg = totallen;
+	return(foundit);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.
Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	int count, bytes, forward, error, retval, i;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * NOTE: the 0/1/2 result described above is delivered through
+	 * *action. The function's own return value is 0, or the error
+	 * from xfs_da_path_shift() or xfs_da_read_buf().
+	 *
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+	leaf = (xfs_attr_leafblock_t *)info;
+	count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	bytes = sizeof(xfs_attr_leaf_hdr_t) +
+		count * sizeof(xfs_attr_leaf_entry_t) +
+		INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+	if (bytes > (state->blocksize >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return(0);
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block. We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (!INT_ISZERO(info->forw, ARCH_CONVERT));
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return(error);
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return(0);
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare. We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink an attribute list over time.
+	 *
+	 * For each candidate, "bytes" starts at 75% of the block size and
+	 * has both blocks' used bytes, the combined entry table and one
+	 * header subtracted; a non-negative result means the merge fits.
+	 * A sibling number of 0 means there is no sibling on that side.
+	 */
+	/* start with smaller blk num */
+	forward = (INT_GET(info->forw, ARCH_CONVERT)
+					< INT_GET(info->back, ARCH_CONVERT));
+	for (i = 0; i < 2; forward = !forward, i++) {
+		if (forward)
+			blkno = INT_GET(info->forw, ARCH_CONVERT);
+		else
+			blkno = INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+
+		leaf = (xfs_attr_leafblock_t *)info;	/* this block */
+		count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes = state->blocksize - (state->blocksize>>2);
+		bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+		leaf = bp->data;			/* the sibling */
+		ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+		count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+		bytes -= count * sizeof(xfs_attr_leaf_entry_t);
+		bytes -= sizeof(xfs_attr_leaf_hdr_t);
+		xfs_da_brelse(state->args->trans, bp);
+		if (bytes >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;	/* neither sibling can absorb this block */
+		return(0);
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return(error);
+	if (retval) {
+		*action = 0;
+	} else {
+		*action = 1;
+	}
+	return(0);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_map_t *map;
+	xfs_attr_leaf_entry_t *entry;
+	int before, after, smallest, entsize;
+	int tablesize, tmp, i;
+	xfs_mount_t *mp;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	mp = args->trans->t_mountp;
+	ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0)
+		&& (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
+	ASSERT((args->index >= 0)
+		&& (args->index < INT_GET(hdr->count, ARCH_CONVERT)));
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
+				>= ((INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(*entry))+sizeof(*hdr)));
+	entry = &leaf->entries[args->index];
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
+			>= INT_GET(hdr->firstused, ARCH_CONVERT));
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+
+	/*
+	 * Scan through free region table:
+	 * check for adjacency of free'd entry with an existing one,
+	 * find smallest free region in case we need to replace it,
+	 * adjust any map that borders the entry table,
+	 */
+	tablesize = INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	tmp = INT_GET(map->size, ARCH_CONVERT);
+	before = after = -1;
+	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+	entsize =
xfs_attr_leaf_entsize(leaf, args->index); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { + ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp)); + ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp)); + if (INT_GET(map->base, ARCH_CONVERT) == tablesize) { + INT_MOD(map->base, ARCH_CONVERT, + -sizeof(xfs_attr_leaf_entry_t)); + INT_MOD(map->size, ARCH_CONVERT, + sizeof(xfs_attr_leaf_entry_t)); + } + + if ((INT_GET(map->base, ARCH_CONVERT) + + INT_GET(map->size, ARCH_CONVERT)) + == INT_GET(entry->nameidx, ARCH_CONVERT)) { + before = i; + } else if (INT_GET(map->base, ARCH_CONVERT) + == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) { + after = i; + } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) { + tmp = INT_GET(map->size, ARCH_CONVERT); + smallest = i; + } + } + + /* + * Coalesce adjacent freemap regions, + * or replace the smallest region. + */ + if ((before >= 0) || (after >= 0)) { + if ((before >= 0) && (after >= 0)) { + map = &hdr->freemap[before]; + INT_MOD(map->size, ARCH_CONVERT, entsize); + INT_MOD(map->size, ARCH_CONVERT, + INT_GET(hdr->freemap[after].size, + ARCH_CONVERT)); + INT_ZERO(hdr->freemap[after].base, ARCH_CONVERT); + INT_ZERO(hdr->freemap[after].size, ARCH_CONVERT); + } else if (before >= 0) { + map = &hdr->freemap[before]; + INT_MOD(map->size, ARCH_CONVERT, entsize); + } else { + map = &hdr->freemap[after]; + /* both on-disk, don't endian flip twice */ + map->base = entry->nameidx; + INT_MOD(map->size, ARCH_CONVERT, entsize); + } + } else { + /* + * Replace smallest region (if it is smaller than free'd entry) + */ + map = &hdr->freemap[smallest]; + if (INT_GET(map->size, ARCH_CONVERT) < entsize) { + INT_SET(map->base, ARCH_CONVERT, + INT_GET(entry->nameidx, ARCH_CONVERT)); + INT_SET(map->size, ARCH_CONVERT, entsize); + } + } + + /* + * Did we remove the first entry? 
+ */ + if (INT_GET(entry->nameidx, ARCH_CONVERT) + == INT_GET(hdr->firstused, ARCH_CONVERT)) + smallest = 1; + else + smallest = 0; + + /* + * Compress the remaining entries and zero out the removed stuff. + */ + bzero(XFS_ATTR_LEAF_NAME(leaf, args->index), entsize); + INT_MOD(hdr->usedbytes, ARCH_CONVERT, -entsize); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), + entsize)); + + tmp = (INT_GET(hdr->count, ARCH_CONVERT) - args->index) + * sizeof(xfs_attr_leaf_entry_t); + ovbcopy((char *)(entry+1), (char *)entry, tmp); + INT_MOD(hdr->count, ARCH_CONVERT, -1); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)]; + bzero((char *)entry, sizeof(xfs_attr_leaf_entry_t)); + + /* + * If we removed the first entry, re-find the first used byte + * in the name area. Note that if the entry was the "firstused", + * then we don't have a "hole" in our block resulting from + * removing the name. + */ + if (smallest) { + tmp = XFS_LBSIZE(mp); + entry = &leaf->entries[0]; + for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; + i >= 0; entry++, i--) { + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) + >= INT_GET(hdr->firstused, ARCH_CONVERT)); + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) + < XFS_LBSIZE(mp)); + if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp) + tmp = INT_GET(entry->nameidx, ARCH_CONVERT); + } + INT_SET(hdr->firstused, ARCH_CONVERT, tmp); + if (INT_ISZERO(hdr->firstused, ARCH_CONVERT)) { + INT_SET(hdr->firstused, ARCH_CONVERT, + tmp - XFS_ATTR_LEAF_NAME_ALIGN); + } + } else { + hdr->holes = 1; /* mark as needing compaction */ + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + + /* + * Check if leaf is less than 50% full, caller may want to + * "join" the leaf with a sibling if so. 
+ */ + tmp = sizeof(xfs_attr_leaf_hdr_t); + tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) + * sizeof(xfs_attr_leaf_entry_t); + tmp += INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT); + return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ +} + +/* + * Move all the attribute list entries from drop_leaf into save_leaf. + */ +void +xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; + xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; + xfs_mount_t *mp; + char *tmpbuffer; + + /* + * Set up environment. + */ + mp = state->mp; + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + drop_hdr = &drop_leaf->hdr; + save_hdr = &save_leaf->hdr; + + /* + * Save last hashval from dying block for later Btree fixup. + */ + drop_blk->hashval = + INT_GET(drop_leaf->entries[INT_GET(drop_leaf->hdr.count, + ARCH_CONVERT)-1].hashval, + ARCH_CONVERT); + + /* + * Check if we need a temp buffer, or can we do it in place. + * Note that we don't check "leaf" for holes because we will + * always be dropping it, toosmall() decided that for us already. + */ + if (save_hdr->holes == 0) { + /* + * dest leaf has no holes, so we add there. May need + * to make some room in the entry array. 
+ */ + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp); + } else { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, + INT_GET(save_hdr->count, ARCH_CONVERT), + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), + mp); + } + } else { + /* + * Destination has holes, so we make a temporary copy + * of the leaf and add them both to that. + */ + tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + bzero(tmpbuffer, state->blocksize); + tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; + tmp_hdr = &tmp_leaf->hdr; + tmp_hdr->info = save_hdr->info; /* struct copy */ + INT_ZERO(tmp_hdr->count, ARCH_CONVERT); + INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize); + if (INT_ISZERO(tmp_hdr->firstused, ARCH_CONVERT)) { + INT_SET(tmp_hdr->firstused, ARCH_CONVERT, + state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); + } + INT_ZERO(tmp_hdr->usedbytes, ARCH_CONVERT); + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), + mp); + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, + INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT), + (int)INT_GET(save_hdr->count, ARCH_CONVERT), + mp); + } else { + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, + (int)INT_GET(save_hdr->count, ARCH_CONVERT), + mp); + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, + INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT), + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), + mp); + } + bcopy((char *)tmp_leaf, (char *)save_leaf, state->blocksize); + kmem_free(tmpbuffer, state->blocksize); + } + + xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + state->blocksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. 
+ */ + save_blk->hashval = + INT_GET(save_leaf->entries[INT_GET(save_leaf->hdr.count, + ARCH_CONVERT)-1].hashval, + ARCH_CONVERT); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Look up a name in a leaf attribute list structure. + * This is the internal routine, it uses the caller's buffer. + * + * Note that duplicate keys are allowed, but only check within the + * current leaf node. The Btree code must check in adjacent leaf nodes. + * + * Return in args->index the index into the entry[] array of either + * the found entry, or where the entry should have been (insert before + * that entry). + * + * Don't change the args->value unless we find the attribute. + */ +int +xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int probe, span; + xfs_dahash_t hashval; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + + /* + * Binary search. 
(note: small blocks will skip this loop) + */ + hashval = args->hashval; + probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2; + for (entry = &leaf->entries[probe]; span > 4; + entry = &leaf->entries[probe]) { + span /= 2; + if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval) + probe += span; + else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && \ + ((INT_ISZERO(leaf->hdr.count, ARCH_CONVERT)) + || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)))); + ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) + == hashval)); + + /* + * Since we may have duplicate hashval's, find the first matching + * hashval in the leaf. + */ + while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) + >= hashval)) { + entry--; + probe--; + } + while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) { + entry++; + probe++; + } + if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) + || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) { + args->index = probe; + return(XFS_ERROR(ENOATTR)); + } + + /* + * Duplicate keys may be present, so search all of them for a match. + */ + for ( ; (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval); + entry++, probe++) { +/* + * GROT: Add code to remove incomplete entries. + */ + /* + * If we are looking for INCOMPLETE entries, show only those. + * If we are looking for complete entries, show only those. 
+ */ + if ((args->flags & XFS_ATTR_INCOMPLETE) != + (entry->flags & XFS_ATTR_INCOMPLETE)) { + continue; + } + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe); + if (name_loc->namelen != args->namelen) + continue; + if (bcmp(args->name, (char *)name_loc->nameval, + args->namelen) != 0) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((entry->flags & XFS_ATTR_ROOT) != 0)) + continue; + args->index = probe; + return(XFS_ERROR(EEXIST)); + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe); + if (name_rmt->namelen != args->namelen) + continue; + if (bcmp(args->name, (char *)name_rmt->name, + args->namelen) != 0) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((entry->flags & XFS_ATTR_ROOT) != 0)) + continue; + args->index = probe; + args->rmtblkno + = INT_GET(name_rmt->valueblk, ARCH_CONVERT); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, + INT_GET(name_rmt->valuelen, + ARCH_CONVERT)); + return(XFS_ERROR(EEXIST)); + } + } + args->index = probe; + return(XFS_ERROR(ENOATTR)); +} + +/* + * Get the value associated with an attribute name from a leaf attribute + * list structure. 
+ */ +int +xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + int valuelen; + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + ASSERT(args->index < ((int)INT_GET(leaf->hdr.count, ARCH_CONVERT))); + + entry = &leaf->entries[args->index]; + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); + ASSERT(name_loc->namelen == args->namelen); + ASSERT(bcmp(args->name, name_loc->nameval, args->namelen) == 0); + valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + bcopy(&name_loc->nameval[args->namelen], args->value, valuelen); + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(bcmp(args->name, name_rmt->name, args->namelen) == 0); + valuelen = INT_GET(name_rmt->valuelen, ARCH_CONVERT); + args->rmtblkno = INT_GET(name_rmt->valueblk, ARCH_CONVERT); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + } + return(0); +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Move the indicated entries from one leaf to another. + * NOTE: this routine modifies both source and destination leaves. 
+ *
+ * "count" entries starting at index start_s of leaf_s are inserted at
+ * index start_d of leaf_d.  Each moved name/value is packed immediately
+ * below the destination's current firstused, and the source copy is
+ * zeroed.  Both headers' count and usedbytes are kept up to date, the
+ * destination freemap is rebuilt as a single region between the entry
+ * table and firstused, and the source is flagged as having holes.
+ * Nothing is logged here; mp is only used for XFS_LBSIZE() checks.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
+			xfs_attr_leafblock_t *leaf_d, int start_d,
+			int count, xfs_mount_t *mp)
+{
+	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_attr_leaf_entry_t *entry_s, *entry_d;
+	int desti, tmp, i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0)
+		&& (INT_GET(hdr_s->count, ARCH_CONVERT)
+						< (XFS_LBSIZE(mp)/8)));
+	ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_s->count, ARCH_CONVERT)
+					* sizeof(*entry_s))+sizeof(*hdr_s)));
+	ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
+	ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_d->count, ARCH_CONVERT)
+					* sizeof(*entry_d))+sizeof(*hdr_d)));
+
+	ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
+	ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
+	ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 * Only needed when inserting in front of existing destination
+	 * entries; the entries from start_d onward shift up by "count".
+	 */
+	if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_d->entries[start_d];
+		entry_d = &leaf_d->entries[start_d + count];
+		ovbcopy((char *)entry_s, (char *)entry_d, tmp);
+	}
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate attribute info packed and in sequence.
+	 *
+	 * For each entry, tmp is its name/value size: the destination's
+	 * firstused is lowered by tmp and becomes the new entry's nameidx.
+	 * Both headers' count fields change by one per iteration, so
+	 * hdr_s->count no longer includes moved entries after the loop.
+	 */
+	entry_s = &leaf_s->entries[start_s];
+	entry_d = &leaf_d->entries[start_d];
+	desti = start_d;
+	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT)
+				>= INT_GET(hdr_s->firstused, ARCH_CONVERT));
+		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+		/*
+		 * Code to drop INCOMPLETE entries.  Difficult to use as we
+		 * may also need to change the insertion index.  Code turned
+		 * off for 6.2, should be revisited later.
+		 */
+		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+			bzero(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
+			INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+			entry_d--;	/* to compensate for ++ in loop hdr */
+			desti--;
+			if ((start_s + i) < offset)
+				result++;	/* insertion index adjustment */
+		} else {
+#endif /* GROT */
+			INT_MOD(hdr_d->firstused, ARCH_CONVERT, -tmp);
+			/* both on-disk, don't endian flip twice */
+			entry_d->hashval = entry_s->hashval;
+			/* both on-disk, don't endian flip twice */
+			entry_d->nameidx = hdr_d->firstused;
+			entry_d->flags = entry_s->flags;
+			ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp
+							<= XFS_LBSIZE(mp));
+			ovbcopy(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i),
+				XFS_ATTR_LEAF_NAME(leaf_d, desti), tmp);
+			ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp
+							<= XFS_LBSIZE(mp));
+			bzero(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
+			INT_MOD(hdr_d->usedbytes, ARCH_CONVERT, tmp);
+			INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+			INT_MOD(hdr_d->count, ARCH_CONVERT, 1);
+			tmp = INT_GET(hdr_d->count, ARCH_CONVERT)
+						* sizeof(xfs_attr_leaf_entry_t)
+						+ sizeof(xfs_attr_leaf_hdr_t);
+			ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
+#ifdef GROT
+		}
+#endif /* GROT */
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 * If start_s now equals the (already reduced) source count, the
+	 * moved entries were the tail of the table and are simply zeroed.
+	 */
+	if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + XFS_LBSIZE(mp)));
+		bzero((char *)entry_s, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 *
+		 * NOTE(review): hdr_s->count was already reduced by
+		 * "count" in the copy loop above, so the size computed
+		 * here is (entries left - count) slots, while the entries
+		 * that follow the hole number (entries left - start_s).
+		 * That looks like it shifts too few slots when start_s is
+		 * smaller than count -- confirm against the callers.
+		 */
+		tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s + count];
+		entry_d = &leaf_s->entries[start_s];
+		ovbcopy((char *)entry_s, (char *)entry_d, tmp);
+
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[INT_GET(hdr_s->count,
+							ARCH_CONVERT)];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + XFS_LBSIZE(mp)));
+		bzero((char *)entry_s, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information
+	 * Region 0 becomes the gap between the end of the destination's
+	 * entry table and its firstused; regions 1 and 2 are cleared.
+	 */
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_hdr_t));
+	INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT,
+				INT_GET(hdr_d->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
+				INT_GET(hdr_d->firstused, ARCH_CONVERT)
+			      - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+	INT_ZERO(hdr_d->freemap[1].base, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[2].base, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[1].size, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[2].size, ARCH_CONVERT);
+	hdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */ +int +xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC) && + (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC)); + if ( (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) + && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) + && ( (INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) < + INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) + || (INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT) < + INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT))) ) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from a leaf block. + */ +xfs_dahash_t +xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_attr_leafblock_t *leaf; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + if (count) + *count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT)) + return(0); + return(INT_GET(leaf->entries[INT_GET(leaf->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT)); +} + +/* + * Calculate the number of bytes used to store the indicated attribute + * (whether local or remote only calculate bytes in this block). 
+ *
+ * "index" selects the entry in "leaf".  A local entry is sized with
+ * XFS_ATTR_LEAF_ENTSIZE_LOCAL() from its name and value lengths, a
+ * remote entry with XFS_ATTR_LEAF_ENTSIZE_REMOTE() from its name length.
+ */
+int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int size;
+
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
+		size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
+						INT_GET(name_loc->valuelen,
+								ARCH_CONVERT));
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
+		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
+	}
+	return(size);
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote only calculate bytes in this block).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ *
+ * The attribute is local when its local-format size is below
+ * XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize).  If "local" is non-NULL
+ * it receives 1 for local and 0 for remote; the returned size is the
+ * size for whichever format was chosen.
+ */
+int
+xfs_attr_leaf_newentsize(xfs_da_args_t *args, int blocksize, int *local)
+{
+	int size;
+
+	size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(args->namelen, args->valuelen);
+	if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
+		if (local) {
+			*local = 1;
+		}
+	} else {
+		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(args->namelen);
+		if (local) {
+			*local = 0;
+		}
+	}
+	return(size);
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ *
+ * bp holds the leaf block; context carries the caller's flags, output
+ * buffer state and the cursor (hashval + offset among duplicates) that
+ * records how far the listing has progressed.
+ *
+ * Returns 0 when the block was processed (or the cursor position was
+ * not found in it), otherwise the nonzero value returned by
+ * xfs_attr_put_listent() when an entry could not be added.
+ */
+int
+xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int retval, i;
+
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	xfs_attr_trace_l_cl("blk start", context, leaf);
+
+	/*
+	 * Re-find our place in the leaf block if this is a new syscall.
+	 *
+	 * Scan for the entry whose hashval equals cursor->hashval and
+	 * whose position among the duplicates equals cursor->offset
+	 * (counted in context->dupcnt), or failing that the first entry
+	 * with a larger hashval.  If neither exists in this block, return.
+	 */
+	if (context->resynch) {
+		entry = &leaf->entries[0];
+		for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+							entry++, i++) {
+			if (INT_GET(entry->hashval, ARCH_CONVERT)
+							== cursor->hashval) {
+				if (cursor->offset == context->dupcnt) {
+					context->dupcnt = 0;
+					break;
+				}
+				context->dupcnt++;
+			} else if (INT_GET(entry->hashval, ARCH_CONVERT)
+							> cursor->hashval) {
+				context->dupcnt = 0;
+				break;
+			}
+		}
+		if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+			xfs_attr_trace_l_c("not found", context);
+			return(0);
+		}
+	} else {
+		entry = &leaf->entries[0];
+		i = 0;
+	}
+	context->resynch = 0;
+
+	/*
+	 * We have found our place, start copying out the new attributes.
+	 *
+	 * The loop stops at the end of the block or as soon as an entry
+	 * could not be added (retval != 0).  The cursor follows each entry
+	 * visited: a new hashval resets cursor->offset, and the offset
+	 * advances once an entry has been handled with retval == 0.
+	 * With ATTR_KERNOVAL only the space needed for the names is summed
+	 * into context->count; nothing is copied out.
+	 */
+	retval = 0;
+	for (  ; (i < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+	     && (retval == 0); entry++, i++) {
+		int ns = (entry->flags & XFS_ATTR_ROOT)? ROOT_NAMES:USER_NAMES;
+
+		if (INT_GET(entry->hashval, ARCH_CONVERT) != cursor->hashval) {
+			cursor->hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+			cursor->offset = 0;
+		}
+
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* skip incomplete entries */
+		if (((context->flags & ATTR_ROOT) != 0) !=
+		    ((entry->flags & XFS_ATTR_ROOT) != 0) &&
+		    !(context->flags & ATTR_KERNFULLS))
+			continue;		/* skip non-matching entries */
+
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen +
+					(int)name_loc->namelen + 1;
+			} else {
+				retval = xfs_attr_put_listent(context, ns,
+					(char *)name_loc->nameval,
+					(int)name_loc->namelen,
+					(int)INT_GET(name_loc->valuelen,
+								ARCH_CONVERT));
+			}
+		} else {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen +
+					(int)name_rmt->namelen + 1;
+			} else {
+				retval = xfs_attr_put_listent(context, ns,
+					(char *)name_rmt->name,
+					(int)name_rmt->namelen,
+					(int)INT_GET(name_rmt->valuelen,
+								ARCH_CONVERT));
+			}
+		}
+		if (retval == 0) {
+			cursor->offset++;
+		}
+	}
+	xfs_attr_trace_l_cl("blk end", context, leaf);
+	return(retval);
+}
+
+#define	ATTR_ENTBASESIZE	/* minimum bytes used by an attr: the offset of a_name */ \
+	(((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define	ATTR_ENTSIZE(namelen)	/* actual bytes used by an attr, rounded up to u_int32_t */ \
+	((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+	 & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ *
+ * Two output layouts are produced, selected by context->flags:
+ *
+ * ATTR_KERNAMELS: the namespace prefix (xfs_namespaces[ns]) followed by
+ * the name and a NUL is appended at byte offset context->count of the
+ * buffer.  If that would pass context->firstu, context->count is set to
+ * -1 and 1 is returned.
+ *
+ * Otherwise: an attrlist_ent_t of ATTR_ENTSIZE(namelen) bytes is carved
+ * off the top of the buffer by lowering context->firstu, and its offset
+ * is appended to the al_offset[] array growing up from the bottom.  If
+ * the two would collide, al_more is set and 1 is returned; note that
+ * firstu has already been lowered at that point.
+ *
+ * Returns 0 when the entry was added.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_put_listent(xfs_attr_list_context_t *context,
+		     int ns, char *name, int namelen, int valuelen)
+{
+	attrlist_ent_t *aep;
+	int arraytop;
+
+	ASSERT(!(context->flags & ATTR_KERNOVAL));
+	if (context->flags & ATTR_KERNAMELS) {
+		char *offset;
+		xattr_namespace_t *nsp;
+
+		ASSERT(context->count >= 0);
+
+		nsp = &xfs_namespaces[ns];
+		arraytop = context->count + nsp->namelen + namelen+1;
+		if (arraytop > context->firstu) {
+			context->count = -1;	/* insufficient space */
+			return(1);
+		}
+		offset = (char *)context->alist + context->count;
+		strncpy(offset, nsp->name, nsp->namelen);	/* namespace */
+		offset += nsp->namelen;
+		strncpy(offset, name, namelen);			/* real name */
+		offset += namelen;
+		*offset = '\0';		/* strncpy above does not terminate */
+		context->count += nsp->namelen + namelen + 1;
+		return(0);
+	}
+
+	ASSERT(context->count >= 0);
+	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+	ASSERT(context->firstu >= sizeof(*context->alist));
+	ASSERT(context->firstu <= context->bufsize);
+
+	arraytop = sizeof(*context->alist) +
+			context->count * sizeof(context->alist->al_offset[0]);
+	context->firstu -= ATTR_ENTSIZE(namelen);
+	if (context->firstu < arraytop) {
+		xfs_attr_trace_l_c("buffer full", context);
+		context->alist->al_more = 1;
+		return(1);
+	}
+
+	aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
+	aep->a_valuelen = valuelen;
+	bcopy(name, aep->a_name, namelen);
+	aep->a_name[ namelen ] = 0;
+	context->alist->al_offset[ context->count++ ] = context->firstu;
+	context->alist->al_count = context->count;
+	xfs_attr_trace_l_c("add", context);
+	return(0);
+}
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ *
+ * The leaf is read from args->blkno and the entry is args->index.
+ * If args->rmtblkno is nonzero the entry must be remote, and its
+ * valueblk/valuelen are set from args->rmtblkno/args->valuelen.
+ * The changes are logged and the transaction is rolled with
+ * xfs_attr_rolltrans().  Returns 0 or the error from the read or roll.
+ */
+int
+xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp;
+	int error;
+#ifdef DEBUG
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen;
+	char *name;
+#endif /* DEBUG */
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp != NULL);
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry = &leaf->entries[ args->index ];
+	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		namelen = name_loc->namelen;
+		name = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		namelen = name_rmt->namelen;
+		name = (char *)name_rmt->name;
+	}
+	ASSERT(INT_GET(entry->hashval, ARCH_CONVERT) == args->hashval);
+	ASSERT(namelen == args->namelen);
+	ASSERT(bcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+	entry->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+	if (args->rmtblkno) {
+		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
+		INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
+		xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ *
+ * The leaf is read from args->blkno and the entry is args->index.
+ * For a remote entry the recorded valueblk and valuelen are zeroed as
+ * well.  The changes are logged and the transaction is rolled with
+ * xfs_attr_rolltrans().  Returns 0 or the error from the read or roll.
+ */
+int
+xfs_attr_leaf_setflag(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp != NULL);
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry = &leaf->entries[ args->index ];
+
+	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+	entry->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp,
+			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ *
+ * When both entries live in the same block only one buffer is read and
+ * bp2 aliases bp1.  If args->rmtblkno is nonzero, the entry being
+ * cleared must be remote and its valueblk/valuelen are set from
+ * args->rmtblkno/args->valuelen; if the entry being marked INCOMPLETE
+ * is remote, its valueblk/valuelen are zeroed.  All changes are logged
+ * before the transaction is rolled with xfs_attr_rolltrans().
+ * Returns 0 or the error from a read or from the roll.
+ */
+int
+xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_entry_t *entry1, *entry2;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp1, *bp2;
+	int error;
+#ifdef DEBUG
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen1, namelen2;
+	char *name1, *name2;
+#endif /* DEBUG */
+
+	/*
+	 * Read the block containing the "old" attr
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp1 != NULL);
+
+	/*
+	 * Read the block containing the "new" attr, if it is different
+	 */
+	if (args->blkno2 != args->blkno) {
+		error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
+					-1, &bp2, XFS_ATTR_FORK);
+		if (error) {
+			return(error);
+		}
+		ASSERT(bp2 != NULL);
+	} else {
+		bp2 = bp1;	/* same block: share the one buffer */
+	}
+
+	leaf1 = bp1->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf1->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry1 = &leaf1->entries[ args->index ];
+
+	leaf2 = bp2->data;
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index2 < INT_GET(leaf2->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index2 >= 0);
+	entry2 = &leaf2->entries[ args->index2 ];
+
+#ifdef DEBUG
+	if (entry1->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
+		namelen1 = name_loc->namelen;
+		name1 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		namelen1 = name_rmt->namelen;
+		name1 = (char *)name_rmt->name;
+	}
+	if (entry2->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
+		namelen2 = name_loc->namelen;
+		name2 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		namelen2 = name_rmt->namelen;
+		name2 = (char *)name_rmt->name;
+	}
+	ASSERT(INT_GET(entry1->hashval, ARCH_CONVERT) == INT_GET(entry2->hashval, ARCH_CONVERT));
+	ASSERT(namelen1 == namelen2);
+	ASSERT(bcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp1,
+			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+	if (args->rmtblkno) {
+		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
+		INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
+		xfs_da_log_buf(args->trans, bp1,
+			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+	}
+
+	entry2->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp2,
+			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp2,
+			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp1);
+	if (bp1 != bp2)		/* don't finish the shared buffer twice */
+		xfs_da_buf_done(bp2);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*========================================================================
+ * Indiscriminately delete the entire attribute fork
+ *========================================================================*/
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
+{
+	xfs_da_blkinfo_t *info;
+	xfs_daddr_t blkno;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Read block 0 to see what we have to work with.
+	 * We only get here if we have extents, since we remove
+	 * the extents in reverse order the extent containing
+	 * block 0 must still be there.
+	 */
+	error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	blkno = xfs_da_blkno(bp);	/* save: bp is released below */
+
+	/*
+	 * Invalidate the tree, even if the "tree" is only a single leaf block.
+	 * This is a depth-first traversal!  Every branch releases "bp".
+	 */
+	info = bp->data;
+	if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+		error = xfs_attr_node_inactive(trans, dp, bp, 1);
+	} else if (INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
+		error = xfs_attr_leaf_inactive(trans, dp, bp);
+	} else {
+		error = XFS_ERROR(EIO);	/* unknown magic: not an attr block */
+		xfs_da_brelse(*trans, bp);
+	}
+	if (error)
+		return(error);
+
+	/*
+	 * Invalidate the incore copy of the root block.
+	 */
+	error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	xfs_da_binval(*trans, bp);	/* remove from cache */
+	/*
+	 * Commit the invalidate and start the next transaction.
+	 */
+	error = xfs_attr_rolltrans(trans, dp);
+
+	return (error);
+}
+
+/*
+ * Invalidate every block below node "bp", depth-first; "bp" is released
+ * here.  "level" bounds the recursion (EIO past XFS_DA_NODE_MAXDEPTH).
+ */
+int
+xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
+				   int level)
+{
+	xfs_da_blkinfo_t *info;
+	xfs_da_intnode_t *node;
+	xfs_dablk_t child_fsb;
+	xfs_daddr_t parent_blkno, child_blkno;
+	int error, count, i;
+	xfs_dabuf_t *child_bp;
+
+	/*
+	 * Since this code is recursive (gasp!) we must protect ourselves.
+	 */
+	if (level > XFS_DA_NODE_MAXDEPTH) {
+		xfs_da_brelse(*trans, bp);	/* no locks for later trans */
+		return(XFS_ERROR(EIO));
+	}
+
+	node = bp->data;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+						== XFS_DA_NODE_MAGIC);
+	parent_blkno = xfs_da_blkno(bp);	/* save for re-read later */
+	count = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (!count) {
+		xfs_da_brelse(*trans, bp);
+		return(0);
+	}
+	child_fsb = INT_GET(node->btree[0].before, ARCH_CONVERT);
+	xfs_da_brelse(*trans, bp);	/* no locks for later trans */
+
+	/*
+	 * If this is the node level just above the leaves, simply loop
+	 * over the leaves removing all of them.  If this is higher up
+	 * in the tree, recurse downward.
+	 */
+	for (i = 0; i < count; i++) {
+		/*
+		 * Read the subsidiary block to see what we have to work with.
+		 * Don't do this in a transaction.  This is a depth-first
+		 * traversal of the tree so we may deal with many blocks
+		 * before we come back to this one.
+		 */
+		error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+						XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		if (child_bp) {
+						/* save for re-read later */
+			child_blkno = xfs_da_blkno(child_bp);
+
+			/*
+			 * Invalidate the subtree, however we have to.
+			 */
+			info = child_bp->data;
+			if (INT_GET(info->magic, ARCH_CONVERT)
+							== XFS_DA_NODE_MAGIC) {
+				error = xfs_attr_node_inactive(trans, dp,
+						child_bp, level+1);
+			} else if (INT_GET(info->magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC) {
+				error = xfs_attr_leaf_inactive(trans, dp,
+						child_bp);
+			} else {
+				error = XFS_ERROR(EIO);
+				xfs_da_brelse(*trans, child_bp);
+			}
+			if (error)
+				return(error);
+
+			/*
+			 * Remove the subsidiary block from the cache
+			 * and from the log.
+			 */
+			error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+				&child_bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			xfs_da_binval(*trans, child_bp);
+		}
+
+		/*
+		 * If we're not done, re-read the parent to get the next
+		 * child block number.
+		 */
+		if ((i+1) < count) {
+			error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
+				&bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			node = bp->data;	/* old bp was released: refresh */
+			child_fsb = INT_GET(node->btree[i+1].before, ARCH_CONVERT);
+			xfs_da_brelse(*trans, bp);
+		}
+		/*
+		 * Atomically commit the whole invalidate stuff.
+		 */
+		if ((error = xfs_attr_rolltrans(trans, dp)))
+			return (error);
+	}
+
+	return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.  Returns 0 or the first errno from xfs_attr_leaf_freextent().
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+int
+xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_attr_inactive_list_t *list, *lp;
+	int error, count, size, tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+
+	/*
+	 * Count the number of "remote" value extents.
+	 */
+	count = 0;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (   INT_GET(entry->nameidx, ARCH_CONVERT)
+		    && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (!INT_ISZERO(name_rmt->valueblk, ARCH_CONVERT))
+				count++;
+		}
+	}
+
+	/*
+	 * If there are no "remote" values, we're done.
+	 */
+	if (count == 0) {
+		xfs_da_brelse(*trans, bp);
+		return(0);
+	}
+
+	/*
+	 * Allocate storage for a list of all the "remote" value extents.
+	 */
+	size = count * sizeof(xfs_attr_inactive_list_t);
+	list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Identify each of the "remote" value extents (length kept in blocks).
+	 */
+	lp = list;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (   INT_GET(entry->nameidx, ARCH_CONVERT)
+		    && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (!INT_ISZERO(name_rmt->valueblk, ARCH_CONVERT)) {
+				/* both on-disk, don't endian flip twice */
+				lp->valueblk = name_rmt->valueblk;
+				INT_SET(lp->valuelen, ARCH_CONVERT,
+						XFS_B_TO_FSB(dp->i_mount,
+						    INT_GET(name_rmt->valuelen,
+							      ARCH_CONVERT)));
+				lp++;
+			}
+		}
+	}
+	xfs_da_brelse(*trans, bp);	/* unlock for trans. in freextent() */
+
+	/*
+	 * Invalidate each of the "remote" value extents.
+	 */
+	error = 0;
+	for (lp = list, i = 0; i < count; i++, lp++) {
+		tmp = xfs_attr_leaf_freextent(trans, dp,
+						     INT_GET(lp->valueblk,
+							     ARCH_CONVERT),
+						     INT_GET(lp->valuelen,
+							     ARCH_CONVERT));
+		if (error == 0)
+			error = tmp;	/* save only the 1st errno */
+	}
+
+	kmem_free((xfs_caddr_t)list, size);
+	return(error);
+}
+
+/*
+ * Look at all the extents backing attr-fork blocks [blkno, blkno+blkcnt),
+ * invalidate any buffers that are incore/in transactions.
+ */
+int
+xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
+				    xfs_dablk_t blkno, int blkcnt)
+{
+	xfs_bmbt_irec_t map;
+	xfs_dablk_t tblkno;
+	int tblkcnt, dblkcnt, nmap, error;
+	xfs_daddr_t dblkno;
+	xfs_buf_t *bp;
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's
+	 * blocks.
+	 */
+	tblkno = blkno;
+	tblkcnt = blkcnt;
+	while (tblkcnt > 0) {
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		nmap = 1;
+		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
+					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+					NULL, 0, &map, &nmap, NULL);
+		if (error) {
+			return(error);
+		}
+		ASSERT(nmap == 1);
+		ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+		/*
+		 * If it's a hole, these are already unmapped
+		 * so there's nothing to invalidate.
+		 */
+		if (map.br_startblock != HOLESTARTBLOCK) {
+
+			dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+						  map.br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+						map.br_blockcount);
+			bp = xfs_trans_get_buf(*trans,
+					dp->i_mount->m_ddev_targp,
+					dblkno, dblkcnt, 0);
+			xfs_trans_binval(*trans, bp);
+			/*
+			 * Roll to next transaction.
+			 */
+			if ((error = xfs_attr_rolltrans(trans, dp)))
+				return (error);
+		}
+
+		tblkno += map.br_blockcount;	/* step past this mapping */
+		tblkcnt -= map.br_blockcount;
+	}
+
+	return(0);
+}
+
+
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to the next.
+ */
+int
+xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
+{
+	xfs_trans_t *trans;
+	unsigned int logres, count;
+	int error;
+
+	/*
+	 * Ensure that the inode is always logged.
+	 */
+	trans = *transp;
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Copy the critical parameters from one trans to the next.
+	 */
+	logres = trans->t_log_res;
+	count = trans->t_log_count;
+	*transp = xfs_trans_dup(trans);
+
+	/*
+	 * Commit the current transaction.
+	 * If this commit failed, then it'd just unlock those items that
+	 * are not marked ihold. That also means that a filesystem shutdown
+	 * is in progress. The caller takes the responsibility to cancel
+	 * the duplicate transaction that gets returned.
+	 */
+	if ((error = xfs_trans_commit(trans, 0, NULL)))
+		return (error);
+
+	trans = *transp;
+
+	/*
+	 * Reserve space in the log for the next transaction.
+	 * This also pushes items in the "AIL", the list of logged items,
+	 * out to disk if they are taking up space at the tail of the log
+	 * that we want to use.  This requires that either nothing be locked
+	 * across this call, or that anything that is locked be logged in
+	 * the prior and the next transactions.
+	 */
+	error = xfs_trans_reserve(trans, 0, logres, 0,
+				  XFS_TRANS_PERM_LOG_RES, count);
+	/*
+	 * Ensure that the inode is in the new transaction and locked.
+	 */
+	if (!error) {
+		xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(trans, dp);
+	}
+	return (error);
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_attr_leaf.h linux-2.4-xfs/fs/xfs/xfs_attr_leaf.h
--- linux-2.4.19/fs/xfs/xfs_attr_leaf.h	Thu Jan  1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_attr_leaf.h	Wed Jul 10 23:13:51 2002
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+
+/*
+ * Attribute storage layout, internal structure, access macros, etc.
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.
Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys. The
+ * internal links in the Btree are logical block offsets into the file.
+ */
+
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*========================================================================
+ * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This is the structure of the leaf nodes in the Btree.
+ *
+ * Struct leaf_entry's are packed from the top. Name/values grow from the
+ * bottom but are not packed. The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped. When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again. If we
+ * still don't have enough space, then we have to split the block. The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string. The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard. If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT versus USER attribute in
+ * the leaf_entry. The namespaces are independent only because we also look
+ * at the root/user bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry. It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set. We clear the
+ * bit when we have finished setting up the attribute. We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */
+
+typedef struct xfs_attr_leafblock {
+	struct xfs_attr_leaf_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active leaf_entry's */
+		__uint16_t usedbytes;	/* num bytes of names/values stored */
+		__uint16_t firstused;	/* first used byte in name area */
+		__uint8_t holes;	/* != 0 if blk needs compaction */
+		__uint8_t pad1;
+		struct xfs_attr_leaf_map {	/* RLE map of free bytes */
+			__uint16_t base;	/* base of free region */
+			__uint16_t size;	/* length of free region */
+		} freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */
+	} hdr;
+	struct xfs_attr_leaf_entry {	/* sorted on key, not name */
+		xfs_dahash_t hashval;	/* hash value of name */
+		__uint16_t nameidx;	/* index into buffer of name/value */
+		__uint8_t flags;	/* LOCAL, ROOT and INCOMPLETE flags */
+		__uint8_t pad2;		/* unused pad byte */
+	} entries[1];			/* variable sized array */
+	struct xfs_attr_leaf_name_local {
+		__uint16_t valuelen;	/* number of bytes in value */
+		__uint8_t namelen;	/* length of name bytes */
+		__uint8_t nameval[1];	/* name/value bytes */
+	} namelist;			/* grows from bottom of buf */
+	struct xfs_attr_leaf_name_remote {
+		xfs_dablk_t valueblk;	/* block number of value bytes */
+		__uint32_t valuelen;	/* number of bytes in value */
+		__uint8_t namelen;	/* length of name bytes */
+		__uint8_t name[1];	/* name bytes */
+	} valuelist;			/* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t;
+typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t;
+typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t;
+typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t;
+typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t;
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define XFS_ATTR_LOCAL_BIT	0	/* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT	1	/* limit access to attr to userid 0 */
+#define XFS_ATTR_INCOMPLETE_BIT	7	/* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL		(1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT		(1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
+xfs_attr_leaf_name_remote_t *
+xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) \
+	xfs_attr_leaf_name_remote(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)	/* remote name struct ptr */ \
+	((xfs_attr_leaf_name_remote_t *) \
+	 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
+xfs_attr_leaf_name_local_t *
+xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) \
+	xfs_attr_leaf_name_local(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)	/* local name struct ptr */ \
+	((xfs_attr_leaf_name_local_t *) \
+	 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME)
+char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME(leafp,idx) xfs_attr_leaf_name(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME(leafp,idx)		/* generic name struct ptr */ \
+	(&((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
+int xfs_attr_leaf_entsize_remote(int nlen);
+#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) \
+	xfs_attr_leaf_entsize_remote(nlen)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)	/* space for remote struct */ \
+	(((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+	  XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
+int xfs_attr_leaf_entsize_local(int nlen, int vlen);
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) \
+	xfs_attr_leaf_entsize_local(nlen,vlen)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)	/* space for local struct */ \
+	(((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + \
+	  XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
+int xfs_attr_leaf_entsize_local_max(int bsize);
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) \
+	xfs_attr_leaf_entsize_local_max(bsize)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)	/* max local size: 3/4 bsize */ \
+	(((bsize) >> 1) + ((bsize) >> 2))
+#endif
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+typedef struct xfs_attr_list_context {
+	struct xfs_inode *dp;		/* inode */
+	struct attrlist_cursor_kern *cursor;/* position in list */
+	struct attrlist *alist;		/* output buffer */
+	int count;			/* num used entries */
+	int dupcnt;			/* count dup hashvals seen */
+	int bufsize;/* total buffer size */
+	int firstu;			/* first used byte in buffer */
+	int flags;			/* from VOP call */
+	int resynch;/* T/F: resynch with cursor */
+} xfs_attr_list_context_t;
+
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ */
+typedef struct xfs_attr_inactive_list {
+	xfs_dablk_t valueblk;	/* block number of value bytes */
+	int valuelen;		/* value length in fs blocks, not bytes */
+} xfs_attr_inactive_list_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when dirsize < XFS_LITINO(mp).
+ */
+int xfs_attr_shortform_create(struct xfs_da_args *args);
+int xfs_attr_shortform_add(struct xfs_da_args *add);
+int xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_attr_shortform_remove(struct xfs_da_args *remove);
+int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int xfs_attr_shortform_replace(struct xfs_da_args *args);
+int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
+
+/*
+ * Internal routines when dirsize == XFS_LBSIZE(mp).
+ */
+int xfs_attr_leaf_to_node(struct xfs_da_args *args);
+int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
+				struct xfs_da_args *args);
+int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
+int xfs_attr_leaf_setflag(struct xfs_da_args *args);
+int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int xfs_attr_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
+				struct xfs_dabuf **bpp);
+int xfs_attr_leaf_split(struct xfs_da_state *state,
+				struct xfs_da_state_blk *oldblk,
+				struct xfs_da_state_blk *newblk);
+int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
+				struct xfs_da_args *args);
+int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
+int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
+				struct xfs_da_args *args);
+int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
+				struct xfs_da_args *args);
+int xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
+				struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+				struct xfs_da_state_blk *drop_blk,
+				struct xfs_da_state_blk *save_blk);
+int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+int xfs_attr_node_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
+				struct xfs_dabuf *bp, int level);
+int xfs_attr_leaf_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
+				struct xfs_dabuf *bp);
+int xfs_attr_leaf_freextent(struct xfs_trans **trans, struct xfs_inode *dp,
+				xfs_dablk_t blkno, int blkcnt);
+
+/*
+ * Utility routines.
+ */
+xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
+int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
+				struct xfs_dabuf *leaf2_bp);
+int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int blocksize,
+				int *local);
+int xfs_attr_leaf_entsize(struct xfs_attr_leafblock *leaf, int index);
+int xfs_attr_put_listent(struct xfs_attr_list_context *context,
+				int ns, char *name, int namelen, int valuelen);
+int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
+
+#endif /* __XFS_ATTR_LEAF_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_attr_sf.h linux-2.4-xfs/fs/xfs/xfs_attr_sf.h
--- linux-2.4.19/fs/xfs/xfs_attr_sf.h	Thu Jan  1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_attr_sf.h	Wed Jul 10 23:13:51 2002
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define __XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+
+struct xfs_inode;
+
+/*
+ * Entries are packed toward the top as tight as possible.
+ */
+typedef struct xfs_attr_shortform {
+	struct xfs_attr_sf_hdr {	/* constant-structure header block */
+		__uint16_t totsize;	/* total bytes in shortform list */
+		__uint8_t count;	/* count of active entries */
+	} hdr;
+	struct xfs_attr_sf_entry {
+		__uint8_t namelen;	/* actual length of name (no NULL) */
+		__uint8_t valuelen;	/* actual length of value (no NULL) */
+		__uint8_t flags;	/* flags bits (see xfs_attr_leaf.h) */
+		__uint8_t nameval[1];	/* name & value bytes concatenated */
+	} list[1];			/* variable sized array */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+
+/*
+ * We generate this then sort it, attr_list() must return things in hash-order.
+ */
+typedef struct xfs_attr_sf_sort {
+	__uint8_t entno;	/* entry number in original list */
+	__uint8_t namelen;	/* length of name value (no null) */
+	__uint8_t valuelen;	/* length of value */
+	__uint8_t flags;	/* flags bits (see xfs_attr_leaf.h) */
+	xfs_dahash_t hash;	/* this entry's hash value */
+	char *name;		/* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
+int xfs_attr_sf_entsize_byname(int nlen, int vlen);
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) \
+	xfs_attr_sf_entsize_byname(nlen,vlen)
+#else
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))
+#endif
+#define XFS_ATTR_SF_ENTSIZE_MAX			/* max space for name&value */ \
+	((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE)
+int xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep);
+#define XFS_ATTR_SF_ENTSIZE(sfep) xfs_attr_sf_entsize(sfep)
+#else
+#define XFS_ATTR_SF_ENTSIZE(sfep)		/* space an entry uses */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_NEXTENTRY)
+xfs_attr_sf_entry_t *xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep);
+#define XFS_ATTR_SF_NEXTENTRY(sfep) xfs_attr_sf_nextentry(sfep)
+#else
+#define XFS_ATTR_SF_NEXTENTRY(sfep)		/* next entry in struct */ \
+	((xfs_attr_sf_entry_t *) \
+		((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_TOTSIZE)
+int xfs_attr_sf_totsize(struct xfs_inode *dp);
+#define XFS_ATTR_SF_TOTSIZE(dp) xfs_attr_sf_totsize(dp)
+#else
+#define XFS_ATTR_SF_TOTSIZE(dp)			/* total space in use */ \
+	(INT_GET(((xfs_attr_shortform_t *)((dp)->i_afp->if_u1.if_data))->hdr.totsize, ARCH_CONVERT))
+#endif
+
+#ifdef XFS_ALL_TRACE
+#define XFS_ATTR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_ATTR_TRACE	/* attr tracing is only kept in DEBUG builds */
+#endif
+
+/*
+ * Kernel tracing support for attribute lists
+ */
+struct xfs_attr_list_context;
+struct xfs_da_intnode;
+struct xfs_da_node_entry;
+struct xfs_attr_leafblock;
+
+#define XFS_ATTR_TRACE_SIZE	4096	/* size of global trace buffer */
+
+/*
+ * Trace record types.
+ */
+#define XFS_ATTR_KTRACE_L_C	1	/* context */
+#define XFS_ATTR_KTRACE_L_CN	2	/* context, node */
+#define XFS_ATTR_KTRACE_L_CB	3	/* context, btree */
+#define XFS_ATTR_KTRACE_L_CL	4	/* context, leaf */
+
+#if defined(XFS_ATTR_TRACE)
+
+void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
+void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_da_intnode *node);
+void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_da_node_entry *btree);
+void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_attr_leafblock *leaf);
+void xfs_attr_trace_enter(int type, char *where,
+			     __psunsigned_t a2, __psunsigned_t a3,
+			     __psunsigned_t a4, __psunsigned_t a5,
+			     __psunsigned_t a6, __psunsigned_t a7,
+			     __psunsigned_t a8, __psunsigned_t a9,
+			     __psunsigned_t a10, __psunsigned_t a11,
+			     __psunsigned_t a12, __psunsigned_t a13,
+			     __psunsigned_t a14, __psunsigned_t a15);
+#else	/* !XFS_ATTR_TRACE: the trace calls expand to nothing */
+#define xfs_attr_trace_l_c(w,c)
+#define xfs_attr_trace_l_cn(w,c,n)
+#define xfs_attr_trace_l_cb(w,c,b)
+#define xfs_attr_trace_l_cl(w,c,l)
+#endif /* XFS_ATTR_TRACE */
+
+#endif /* __XFS_ATTR_SF_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_bit.c linux-2.4-xfs/fs/xfs/xfs_bit.c
--- linux-2.4.19/fs/xfs/xfs_bit.c	Thu Jan  1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_bit.c	Wed Aug 21 01:24:17 2002
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+#include <xfs.h>
+
+#ifndef HAVE_ARCH_HIGHBIT
+/*
+ * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
+ */
+const char xfs_highbit[256] = {
+       -1, 0, 1, 1, 2, 2, 2, 2,			/* 00 .. 07 */
+	3, 3, 3, 3, 3, 3, 3, 3,			/* 08 .. 0f */
+	4, 4, 4, 4, 4, 4, 4, 4,			/* 10 .. 17 */
+	4, 4, 4, 4, 4, 4, 4, 4,			/* 18 .. 1f */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 20 .. 27 */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 28 .. 2f */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 30 .. 37 */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 38 .. 3f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 40 .. 47 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 48 .. 4f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 50 .. 57 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 58 .. 5f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 60 .. 67 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 68 .. 6f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 70 .. 77 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 78 .. 7f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 80 .. 87 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 88 .. 8f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 90 .. 97 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 98 .. 9f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* a0 .. a7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* a8 .. af */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* b0 .. b7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* b8 .. bf */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* c0 .. c7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* c8 .. cf */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* d0 .. d7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* d8 .. df */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* e0 .. e7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* e8 .. ef */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* f0 .. f7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* f8 .. ff */
+};
+#endif
+
+/*
+ * Count of bits set in byte, 0..8.
+ */
+static const char xfs_countbit[256] = {
+	0, 1, 1, 2, 1, 2, 2, 3,			/* 00 .. 07 */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 08 .. 0f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 10 .. 17 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 18 .. 1f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 20 .. 27 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 28 .. 2f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 30 .. 37 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 38 .. 3f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 40 .. 47 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 48 .. 4f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 50 .. 57 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 58 .. 5f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 60 .. 67 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 68 .. 6f */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 70 .. 77 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* 78 .. 7f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 80 .. 87 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 88 .. 8f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 90 .. 97 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 98 .. 9f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* a0 .. a7 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* a8 .. af */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* b0 .. b7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* b8 .. bf */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* c0 .. c7 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* c8 .. cf */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* d0 .. d7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* d8 .. df */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* e0 .. e7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* e8 .. ef */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* f0 .. f7 */
+	5, 6, 6, 7, 6, 7, 7, 8,			/* f8 .. ff */
+};
+
+/*
+ * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
+ */
+int inline
+xfs_highbit32(
+	__uint32_t	v)
+{
+#ifdef HAVE_ARCH_HIGHBIT
+	return highbit32(v);
+#else
+	int		i;	/* bit offset of the highest non-zero byte */
+
+	if (v & 0xffff0000)
+		if (v & 0xff000000)
+			i = 24;
+		else
+			i = 16;
+	else if (v & 0x0000ffff)
+		if (v & 0x0000ff00)
+			i = 8;
+		else
+			i = 0;
+	else
+		return -1;
+	return i + xfs_highbit[(v >> i) & 0xff];
+#endif
+}
+
+/*
+ * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
+ */
+int
+xfs_lowbit64(
+	__uint64_t	v)
+{
+	int n;
+	n = ffs((unsigned)v);		/* 1-based; 0 if low word is empty */
+	if (n == 0) {
+		n = ffs((unsigned)(v >> 32));
+		if (n > 0)		/* v == 0 must fall through to -1 */
+			n+=32;
+	}
+	return n-1;
+}
+
+/*
+ * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
+ */
+int
+xfs_highbit64(
+	__uint64_t	v)
+{
+	__uint32_t h = v >> 32;
+	if (h)
+		return xfs_highbit32(h) + 32;
+	return xfs_highbit32((__u32)v);
+}
+
+
+/*
+ * Count the number of bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ *
+ * Do the counting by mapping a byte value to the number of set
+ * bits for that value using the xfs_countbit array, i.e.
+ * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1,
+ * xfs_countbit[3] == 2, etc.
+ */
+int
+xfs_count_bits(uint *map, uint size, uint start_bit)
+{
+	register int	bits;
+	register unsigned char	*bytep;
+	register unsigned char	*end_map;
+	int		byte_bit;
+
+	bits = 0;
+	end_map = (unsigned char *)(map + size);
+	bytep = (unsigned char *)map + (start_bit >> 3); /* byte of start_bit */
+	byte_bit = start_bit & 0x7;
+
+	/*
+	 * If the caller fell off the end of the map, return 0.
+	 */
+	if (bytep >= end_map) {
+		return (0);
+	}
+
+	/*
+	 * If start_bit is not byte aligned, then process the
+	 * first byte separately.
+	 */
+	if (byte_bit != 0) {
+		/*
+		 * Shift off the bits we don't want to look at,
+		 * before indexing into xfs_countbit.
+		 */
+		bits += xfs_countbit[(*bytep >> byte_bit)];
+		bytep++;
+	}
+
+	/*
+	 * Count the bits in each byte until the end of the bitmap.
+	 */
+	while (bytep < end_map) {
+		bits += xfs_countbit[*bytep];
+		bytep++;
+	}
+
+	return (bits);
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint size, uint start_bit)
+{
+#if BITS_PER_LONG == 32
+	return find_next_zero_bit(map,size*sizeof(uint)*8,start_bit) - start_bit;
+#else
+	/*
+	 * The first argument to find_next_zero_bit needs to be aligned,
+	 * but this is coming from the xfs_buf_log_format_t on-disk
+	 * struct, which can't be padded or otherwise modified w/o breaking
+	 * on-disk compatibility... so create a temporary, aligned
+	 * variable, copy over the bitmap, and send that to find_next_zero_bit
+	 * This only happens in recovery, so it's ugly but not too bad.
+	 */
+	void * addr;
+	int bit;
+	size_t bitmap_size = size * sizeof(uint);	/* bytes in the map */
+
+	addr = (void *)kmem_alloc(bitmap_size, KM_SLEEP);
+	memcpy(addr, map, bitmap_size);
+
+	bit = find_next_zero_bit(addr,bitmap_size*8,start_bit) - start_bit;
+
+	kmem_free(addr, bitmap_size);
+
+	return bit;
+#endif
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there.  It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */ +int xfs_next_bit(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = start_bit & ~(NBWORD - 1); + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + if (start_bit >= size) + return -1; + size -= result; + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to zero first offset bits */ + tmp &= (~0U << start_bit); + if (size < NBWORD) + goto found_first; + if (tmp != 0U) + goto found_middle; + size -= NBWORD; + result += NBWORD; + } + while (size >= NBWORD) { + if ((tmp = *p++) != 0U) + goto found_middle; + result += NBWORD; + size -= NBWORD; + } + if (!size) + return -1; + tmp = *p; +found_first: +found_middle: + return result + ffs(tmp) - 1; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_bit.h linux-2.4-xfs/fs/xfs/xfs_bit.h --- linux-2.4.19/fs/xfs/xfs_bit.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_bit.h Wed Jul 10 23:13:51 2002 @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
/*
 * masks with n high/low bits set, 32-bit values & 64-bit values
 *
 * Each mask is provided either as an out-of-line function (when
 * XFS_WANT_FUNCS is set, or XFS_WANT_SPACE together with the matching
 * XFSSO_* switch) or as an inline shift expression.
 *
 * NOTE(review): the inline forms shift by the full width of the type at
 * one end of the range (n == 0 for the HI masks, n == 32 resp. n == 64
 * for the LO masks), which C leaves undefined -- confirm that callers
 * never pass those values.
 */
#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32HI)
__uint32_t xfs_mask32hi(int n);
#define XFS_MASK32HI(n)		xfs_mask32hi(n)
#else
#define XFS_MASK32HI(n)		((__uint32_t)-1 << (32 - (n)))
#endif
#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64HI)
__uint64_t xfs_mask64hi(int n);
#define XFS_MASK64HI(n)		xfs_mask64hi(n)
#else
#define XFS_MASK64HI(n)		((__uint64_t)-1 << (64 - (n)))
#endif
#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32LO)
__uint32_t xfs_mask32lo(int n);
#define XFS_MASK32LO(n)		xfs_mask32lo(n)
#else
#define XFS_MASK32LO(n)		(((__uint32_t)1 << (n)) - 1)
#endif
#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64LO)
__uint64_t xfs_mask64lo(int n);
#define XFS_MASK64LO(n)		xfs_mask64lo(n)
#else
#define XFS_MASK64LO(n)		(((__uint64_t)1 << (n)) - 1)
#endif
xfs_contig_bits(uint *map, uint size, uint start_bit); + +/* Find next set bit in map */ +extern int xfs_next_bit(uint *map, uint size, uint start_bit); + +#endif /* __XFS_BIT_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_bmap.c linux-2.4-xfs/fs/xfs/xfs_bmap.c --- linux-2.4.19/fs/xfs/xfs_bmap.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_bmap.c Thu Sep 5 15:35:08 2002 @@ -0,0 +1,6323 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +#ifdef DEBUG +ktrace_t *xfs_bmap_trace_buf; +#endif + +#ifdef XFSDEBUG +STATIC void +xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); +#endif + +kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * Prototypes for internal bmap routines. + */ + + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after allocating space (or doing a delayed allocation). 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp,/* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. + */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp); /* inode logging flags */ + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int /* error */ +xfs_bmap_alloc( + xfs_bmalloca_t *ap); /* bmap alloc argument struct */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the extent list is already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int async); /* xaction can be async */ + +#ifdef XFSDEBUG +/* + * Check that the extents list for the inode ip is in the right order. 
+ */ +STATIC void +xfs_bmap_check_extents( + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork); /* data or attr fork */ +#else +#define xfs_bmap_check_extents(ip,w) +#endif + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current trans pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int iflags, /* input flags (meta-data or not) */ + int *logflagsp,/* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free); /* list item to be freed */ + +/* + * Remove count entries from the extents array for inode "ip", starting + * at index "idx". Copies the remaining items down over the deleted ones, + * and gives back the excess memory. + */ +STATIC void +xfs_bmap_delete_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting delete index */ + xfs_extnum_t count, /* count of items to delete */ + int whichfork); /* data or attr fork */ + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block.
+ */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Insert new item(s) in the extent list for inode "ip". + * Count new items are inserted at offset idx. + */ +STATIC void +xfs_bmap_insert_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int whichfork); /* data or attr fork */ + +/* + * Convert a local file to an extents file. + * This code is sort of bogus, since the file data needs to get + * logged so it won't be lost. The bmap-level manipulations are ok, though. + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. 
+ */ +STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int whichfork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ + +#ifdef XFS_BMAP_TRACE +/* + * Add a bmap trace buffer entry. Base routine for the others. + */ +STATIC void +xfs_bmap_trace_addentry( + int opcode, /* operation */ + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(ies) */ + xfs_extnum_t cnt, /* count of entries, 1 or 2 */ + xfs_bmbt_rec_t *r1, /* first record */ + xfs_bmbt_rec_t *r2, /* second record or null */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist. + */ +STATIC void +xfs_bmap_trace_delete( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(entries) deleted */ + xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or + * reading in the extents list from the disk (in the btree). 
+ */ +STATIC void +xfs_bmap_trace_insert( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(entries) inserted */ + xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */ + xfs_bmbt_irec_t *r1, /* inserted record 1 */ + xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry after updating an extent list entry in place. + */ +STATIC void +xfs_bmap_trace_post_update( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry updated */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry prior to updating an extent list entry in place. + */ +STATIC void +xfs_bmap_trace_pre_update( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry to be updated */ + int whichfork); /* data or attr fork */ + +#else +#define xfs_bmap_trace_delete(f,d,ip,i,c,w) +#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w) +#define xfs_bmap_trace_post_update(f,d,ip,i,w) +#define xfs_bmap_trace_pre_update(f,d,ip,i,w) +#endif /* XFS_BMAP_TRACE */ + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len); /* delayed extent length */ + +#ifdef DEBUG +/* + * Perform various validation checks on the values being returned + * from xfs_bmapi(). 
+ */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap); +#else +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +#if defined(DEBUG) && defined(XFS_RW_TRACE) +STATIC void +xfs_bunmap_trace( + xfs_inode_t *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + inst_t *ra); +#else +#define xfs_bunmap_trace(ip, bno, len, flags, ra) +#endif /* DEBUG && XFS_RW_TRACE */ + +STATIC int +xfs_bmap_count_tree( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_fsblock_t blockno, + int levelin, + int *count); + +STATIC int +xfs_bmap_count_leaves( + xfs_bmbt_rec_t *frp, + int numrecs, + int *count); + +/* + * Bmap internal routines. + */ + +/* + * Called from xfs_bmap_add_attrfork to handle btree format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int error; /* error return value */ + xfs_mount_t *mp; /* file system mount struct */ + int stat; /* newroot status */ + + mp = ip->i_mount; + if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + *flags |= XFS_ILOG_DBROOT; + else { + cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, + XFS_DATA_FORK); + cur->bc_private.b.flist = flist; + cur->bc_private.b.firstblock = *firstblock; + if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + goto error0; + ASSERT(stat == 1); /* must be at least one entry */ + if ((error = xfs_bmbt_newroot(cur, flags, &stat))) + goto error0; + if (stat == 0) { + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return XFS_ERROR(ENOSPC); + } + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, 
XFS_BTREE_NOERROR); + } + return 0; +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + int error; /* error return value */ + + if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + return 0; + cur = NULL; + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + flags, XFS_DATA_FORK); + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. 
+ */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_da_args_t dargs; /* args for dir/attr code */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount structure pointer */ + + if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + return 0; + if ((ip->i_d.di_mode & IFMT) == IFDIR) { + mp = ip->i_mount; + bzero(&dargs, sizeof(dargs)); + dargs.dp = ip; + dargs.firstblock = firstblock; + dargs.flist = flist; + dargs.total = mp->m_dirblkfsbs; + dargs.whichfork = XFS_DATA_FORK; + dargs.trans = tp; + error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs); + } else + error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, + XFS_DATA_FORK); + return error; +} + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after allocating space (or doing a delayed allocation). 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to use reserved data blocks */ +{ + xfs_btree_cur_t *cur; /* btree cursor or null */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent"; +#endif + xfs_ifork_t *ifp; /* inode fork ptr */ + int logflags; /* returned value */ + xfs_extnum_t nextents; /* number of extents in file now */ + + XFS_STATS_INC(xfsstats.xs_add_exlist); + cur = *curp; + ifp = XFS_IFORK_PTR(ip, whichfork); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(idx <= nextents); + da_old = da_new = 0; + error = 0; + /* + * This is the first extent added to a new/empty file. + * Special case this one, so other routines get to assume there are + * already extents in the list. + */ + if (nextents == 0) { + xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new, + NULL, whichfork); + xfs_bmap_insert_exlist(ip, 0, 1, new, whichfork); + ASSERT(cur == NULL); + ifp->if_lastex = 0; + if (!ISNULLSTARTBLOCK(new->br_startblock)) { + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + } else + logflags = 0; + } + /* + * Any kind of new delayed allocation goes here. 
+ */ + else if (ISNULLSTARTBLOCK(new->br_startblock)) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new, + &logflags, rsvd))) + goto done; + } + /* + * Real allocation off the end of the file. + */ + else if (idx == nextents) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, + &logflags, whichfork))) + goto done; + } else { + xfs_bmbt_irec_t prev; /* old extent at offset idx */ + + /* + * Get the record referred to by idx. + */ + xfs_bmbt_get_all(&ifp->if_u1.if_extents[idx], &prev); + /* + * If it's a real allocation record, and the new allocation ends + * after the start of the referred to record, then we're filling + * in a delayed or unwritten allocation with a real one, or + * converting real back to unwritten. + */ + if (!ISNULLSTARTBLOCK(new->br_startblock) && + new->br_startoff + new->br_blockcount > prev.br_startoff) { + if (prev.br_state != XFS_EXT_UNWRITTEN && + ISNULLSTARTBLOCK(prev.br_startblock)) { + da_old = STARTBLOCKVAL(prev.br_startblock); + if (cur) + ASSERT(cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL); + if ((error = xfs_bmap_add_extent_delay_real(ip, + idx, &cur, new, &da_new, first, flist, + &logflags, rsvd))) + goto done; + } else if (new->br_state == XFS_EXT_NORM) { + ASSERT(new->br_state == XFS_EXT_NORM); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) + goto done; + } else { + ASSERT(new->br_state == XFS_EXT_UNWRITTEN); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) + goto done; + } + ASSERT(*curp == cur || *curp == NULL); + } + /* + * Otherwise we're filling in a hole with an allocation. 
+ */ + else { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, + new, &logflags, whichfork))) + goto done; + } + } + + ASSERT(*curp == cur || *curp == NULL); + /* + * Convert to a btree if necessary. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first, + flist, &cur, da_old > 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto done; + } + /* + * Adjust for changes in reserved delayed indirect blocks. + * Nothing to do for disk quotas here. + */ + if (da_old || da_new) { + xfs_filblks_t nblks; + + nblks = da_new; + if (cur) + nblks += cur->bc_private.b.allocated; + ASSERT(nblks <= da_old); + if (nblks < da_old) + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, + (int)(da_old - nblks), rsvd); + } + /* + * Clear out the allocated field, done with it now in any case. + */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } +done: +#ifdef XFSDEBUG + if (!error) + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); +#endif + *logflagsp = logflags; + return error; +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to use reserved data block allocation */ +{ + xfs_bmbt_rec_t *base; /* base of extent entry list */ + xfs_btree_cur_t *cur; /* btree cursor */ + int diff; /* temp value */ + xfs_bmbt_rec_t *ep; /* extent entry for idx */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_delay_real"; +#endif + int i; /* temp state */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + xfs_filblks_t temp; /* value for dnew calculations */ + xfs_filblks_t temp2; /* value for dnew calculations */ + int tmp_rval; /* partial logging flags */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_FILLING, RIGHT_FILLING, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) +#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ? 
((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE \ + (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) + + /* + * Set up a bunch of variables to make the tests simpler. + */ + cur = *curp; + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_get_all(ep, &PREV); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + /* + * Set flags determining what part of the previous delayed allocation + * extent is being replaced by a real allocation. + */ + STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); + STATE_SET(RIGHT_FILLING, + PREV.br_startoff + PREV.br_blockcount == new_endoff); + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &LEFT); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + } + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. 
+ */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { + xfs_bmbt_get_all(ep + 1, &RIGHT); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + } + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == + RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != + MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)); + error = 0; + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (SWITCH_STATE) { + + case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + /* + * Filling in all of a previously delayed allocation extent. + * The left and right neighbors are both contiguous with new. 
+ */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + /* + * Filling in all of a previously delayed allocation extent. + * The left neighbor is contiguous, the right is not. 
+ */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + /* + * Filling in all of a previously delayed allocation extent. + * The right neighbor is contiguous, the left is not. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + new->br_startblock, + PREV.br_blockcount + + RIGHT.br_blockcount, PREV.br_state))) + goto done; + } + *dnew = 0; + break; + + case MASK2(LEFT_FILLING, RIGHT_FILLING): + /* + * Filling in all of a previously delayed allocation extent. + * Neither the left nor right neighbors are contiguous with + * the new one. 
+ */ + xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + *dnew = 0; + break; + + case MASK2(LEFT_FILLING, LEFT_CONTIG): + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is contiguous. + */ + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + ip->i_df.if_lastex = idx - 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + new->br_blockcount, + LEFT.br_state))) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + *dnew = temp; + break; + + case MASK(LEFT_FILLING): + /* + * Filling in the first part of a previous delayed allocation. 
+ * The left neighbor is not contiguous. + */ + xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_startoff(ep, new_endoff); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + base = ip->i_df.if_u1.if_extents; + ep = &base[idx + 1]; + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1, + XFS_DATA_FORK); + *dnew = temp; + break; + + case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is contiguous with the new allocation. 
+ */ + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + RIGHT.br_state); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + RIGHT.br_blockcount, + RIGHT.br_state))) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + *dnew = temp; + break; + + case MASK(RIGHT_FILLING): + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is not contiguous. 
+ */ + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, + new, NULL, XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); + *dnew = temp; + break; + + case 0: + /* + * Filling in the middle part of a previous delayed allocation. + * Contiguity is impossible here. + * This case is avoided almost all the time. 
+ */ + temp = new->br_startoff - PREV.br_startoff; + xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + r[0] = *new; + r[1].br_startoff = new_endoff; + temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_blockcount = temp2; + xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = xfs_bmap_worst_indlen(ip, temp); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + if (diff > 0 && + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) { + /* + * Ick gross gag me with a spoon. + */ + ASSERT(0); /* want to see if this ever happens! 
*/ + while (diff > 0) { + if (temp) { + temp--; + diff--; + if (!diff || + !xfs_mod_incore_sb(ip->i_mount, + XFS_SBS_FDBLOCKS, -diff, rsvd)) + break; + } + if (temp2) { + temp2--; + diff--; + if (!diff || + !xfs_mod_incore_sb(ip->i_mount, + XFS_SBS_FDBLOCKS, -diff, rsvd)) + break; + } + } + } + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep + 2, NULLSTARTBLOCK((int)temp2)); + xfs_bmap_trace_post_update(fname, "0", ip, idx + 2, + XFS_DATA_FORK); + *dnew = temp + temp2; + break; + + case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK2(LEFT_FILLING, RIGHT_CONTIG): + case MASK2(RIGHT_FILLING, LEFT_CONTIG): + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + case MASK(LEFT_CONTIG): + case MASK(RIGHT_CONTIG): + /* + * These cases are all impossible. + */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +#undef MASK +#undef MASK2 +#undef MASK3 +#undef MASK4 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp) /* inode logging flags */ +{ + xfs_bmbt_rec_t *base; /* base of extent entry list */ + xfs_btree_cur_t *cur; /* btree cursor */ + xfs_bmbt_rec_t *ep; /* extent entry for idx */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_unwritten_real"; +#endif + int i; /* temp state */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_exntst_t newext; /* new extent state */ + xfs_exntst_t oldext; /* old extent state */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_FILLING, RIGHT_FILLING, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) +#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE \ + (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) + + /* + * Set up a bunch of variables to make the tests simpler. + */ + error = 0; + cur = *curp; + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_get_all(ep, &PREV); + newext = new->br_state; + oldext = (newext == XFS_EXT_UNWRITTEN) ? 
+ XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + ASSERT(PREV.br_state == oldext); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); + STATE_SET(RIGHT_FILLING, + PREV.br_startoff + PREV.br_blockcount == new_endoff); + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &LEFT); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + } + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. 
+ */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { + xfs_bmbt_get_all(ep + 1, &RIGHT); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + } + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == + RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != + MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)); + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (SWITCH_STATE) { + + case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + ip->i_d.di_nextents -= 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = 
xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount, + LEFT.br_state))) + goto done; + } + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. 
+ */ + xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmbt_set_state(ep, newext); + xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case MASK2(LEFT_FILLING, RIGHT_FILLING): + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_state(ep, newext); + xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + newext))) + goto done; + } + break; + + case MASK2(LEFT_FILLING, LEFT_CONTIG): + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. 
+ */ + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + if (xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + new->br_blockcount, + LEFT.br_state)) + goto done; + } + break; + + case MASK(LEFT_FILLING): + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. 
+ */ + xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); + xfs_bmbt_set_startoff(ep, new_endoff); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK); + xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + cur->bc_rec.b = *new; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + break; + + case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. 
+ */ + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, newext); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_increment(cur, 0, &i))) + goto done; + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case MASK(RIGHT_FILLING): + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. 
+ */ + xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); + xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, + new, NULL, XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents++; /* one extent became two */ + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); /* shorten the btree record for PREV, keeping oldext */ + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); /* the new record must not be in the btree yet */ + cur->bc_rec.b.br_state = XFS_EXT_NORM; /* NOTE(review): hard-codes XFS_EXT_NORM although the in-core record inserted above is *new (state newext), and the LEFT_FILLING and 0 cases insert *new into the cursor record; confirm this is right when newext is XFS_EXT_UNWRITTEN */ + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. 
+ */ + xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + new->br_startoff - PREV.br_startoff); + xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); + r[0] = *new; + r[1].br_startoff = new_endoff; + r[1].br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_startblock = new->br_startblock + new->br_blockcount; + r[1].br_state = oldext; + xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents += 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + /* new right extent - oldext */ + if ((error = xfs_bmbt_update(cur, r[1].br_startoff, + r[1].br_startblock, r[1].br_blockcount, + r[1].br_state))) + goto done; + /* new left extent - oldext */ + PREV.br_blockcount = + new->br_startoff - PREV.br_startoff; + cur->bc_rec.b = PREV; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_increment(cur, 0, &i))) + goto done; + ASSERT(i == 1); + /* new middle extent - newext */ + cur->bc_rec.b = *new; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + break; + + case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK2(LEFT_FILLING, RIGHT_CONTIG): + case MASK2(RIGHT_FILLING, LEFT_CONTIG): + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + case MASK(LEFT_CONTIG): + case MASK(RIGHT_CONTIG): + /* + * These cases are all impossible. 
+ */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +#undef MASK +#undef MASK2 +#undef MASK3 +#undef MASK4 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + * + * Works on the data fork extent list (ip->i_df) only. The new delayed + * extent is merged into its left and/or right neighbor when that neighbor + * is itself a delayed allocation, is adjacent in file offset, and the + * combined length does not exceed MAXEXTLEN; otherwise a new entry is + * inserted at idx. After a merge the worst-case indirect length is + * recomputed for the merged extent (newlen) and compared with the sum of + * the STARTBLOCKVAL() values of the merged pieces (oldlen); if they differ, + * oldlen - newlen is passed to xfs_mod_incore_sb() for XFS_SBS_FDBLOCKS. + * + * Always sets *logflagsp to 0 and returns 0. + */ +/*ARGSUSED*/ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree (not referenced in this routine) */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to allocate reserved blocks */ +{ + xfs_bmbt_rec_t *base; /* base of extent entry list */ + xfs_bmbt_rec_t *ep; /* extent list entry for idx */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_hole_delay"; +#endif + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + xfs_filblks_t temp; /* temp for indirect calculations */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ?
((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) + + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + state = 0; + ASSERT(ISNULLSTARTBLOCK(new->br_startblock)); + /* + * Check and set flags if this segment has a left neighbor + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &left); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + } + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, &right); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + } + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + * If the left side is already being merged, the right side only + * counts as contiguous when all three pieces together fit in MAXEXTLEN. + */ + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN); + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!STATE_TEST(LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))); + /* + * Switch out based on the contiguity flags. + */ + switch (SWITCH_STATE) { + + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent list entry. + * The left entry (idx - 1) takes the combined length and the + * right entry (idx) is deleted from the list.
+ */ + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, temp); + oldlen = STARTBLOCKVAL(left.br_startblock) + + STARTBLOCKVAL(new->br_startblock) + + STARTBLOCKVAL(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen)); + xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + break; + + case MASK(LEFT_CONTIG): + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + temp = left.br_blockcount + new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, temp); + oldlen = STARTBLOCKVAL(left.br_startblock) + + STARTBLOCKVAL(new->br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen)); + xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + break; + + case MASK(RIGHT_CONTIG): + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor.
+ */ + xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK); + temp = new->br_blockcount + right.br_blockcount; + oldlen = STARTBLOCKVAL(new->br_startblock) + + STARTBLOCKVAL(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_allf(ep, new->br_startoff, + NULLSTARTBLOCK((int)newlen), temp, right.br_state); + xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, + (int)(oldlen - newlen), rsvd); + /* + * Nothing to do for disk quota accounting here. + */ + } + *logflagsp = 0; + return 0; +#undef MASK +#undef MASK2 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. + * + * The new extent is merged with its left and/or right neighbor when that + * neighbor is a real (not delayed) extent that is adjacent in both file + * offset and start block, has the same br_state, and the combined length + * does not exceed MAXEXTLEN; otherwise a new entry is inserted at idx. + * The in-core extent list of the selected fork is updated first; when cur + * is non-NULL the matching bmap btree record is then looked up and + * updated, deleted or inserted as the case requires. + * + * *logflagsp is set for the case taken before any return, including error + * returns. Returns 0, or the error from a failing xfs_bmbt_* call. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *ep; /* pointer to extent entry ins.
point */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_hole_real"; +#endif + int i; /* temp state (status value filled in by the xfs_bmbt_* calls) */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + ep = &ifp->if_u1.if_extents[idx]; + state = 0; + /* + * Check and set flags if this segment has a left neighbor. + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &left); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + } + /* + * Check and set flags if this segment has a current value. + * Not true if we're inserting into the "hole" at eof. + */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, &right); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + } + /* + * We're inserting a real allocation between "left" and "right". + * Set the contiguity flags. Don't let extents get too large.
+ */ + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN); + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == + right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!STATE_TEST(LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)); + + /* + * Select which case we're in here, and implement it. + */ + switch (SWITCH_STATE) { + + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + /* + * New allocation is contiguous with real allocations on the + * left and on the right. + * Merge all three into a single extent list entry. + * The right entry goes away, so the fork's extent count is + * decremented and the inode core must be logged.
+ */ + xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + whichfork); + xfs_bmbt_set_blockcount(ep - 1, + left.br_blockcount + new->br_blockcount + + right.br_blockcount); + xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, + whichfork); + xfs_bmap_trace_delete(fname, "LC|RC", ip, + idx, 1, whichfork); + xfs_bmap_delete_exlist(ip, idx, 1, whichfork); + ifp->if_lastex = idx - 1; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) { + *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff, + right.br_startblock, right.br_blockcount, &i))) + return error; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, 0, &i))) + return error; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + return error; + ASSERT(i == 1); + error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + new->br_blockcount + + right.br_blockcount, left.br_state); + return error; + + case MASK(LEFT_CONTIG): + /* + * New allocation is contiguous with a real allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork); + xfs_bmbt_set_blockcount(ep - 1, + left.br_blockcount + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork); + ifp->if_lastex = idx - 1; + if (cur == NULL) { + *logflagsp = XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = 0; + if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff, + left.br_startblock, left.br_blockcount, &i))) + return error; + ASSERT(i == 1); + error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + new->br_blockcount, + left.br_state); + return error; + + case MASK(RIGHT_CONTIG): + /* + * New allocation is contiguous with a real allocation + * on the right.
+ * Merge the new allocation with the right neighbor. + */ + xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork); + xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork); + ifp->if_lastex = idx; + if (cur == NULL) { + *logflagsp = XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = 0; + if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff, + right.br_startblock, right.br_blockcount, &i))) + return error; + ASSERT(i == 1); + error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + return error; + + case 0: + /* + * New allocation is not contiguous with another + * real allocation. + * Insert a new entry. + */ + xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, + whichfork); + xfs_bmap_insert_exlist(ip, idx, 1, new, whichfork); + ifp->if_lastex = idx; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) { + *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, &i))) + return error; + ASSERT(i == 0); + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_bmbt_insert(cur, &i))) + return error; + ASSERT(i == 1); + return 0; + } +#undef MASK +#undef MASK2 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE + /* NOTREACHED */ + ASSERT(0); + return 0; /* keep gcc quiet */ +} + +#define XFS_ALLOC_GAP_UNITS 4 /* a gap of at most this many times ap->alen is bridged by the neighbor heuristics */ + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + * + * On a successful allocation ap->rval is the first block and ap->alen the + * length actually obtained; di_nblocks is increased, the inode core is + * logged, and i_delayed_blks and the quota counts are adjusted. If no + * space could be found ap->rval is NULLFSBLOCK and ap->alen is 0, and the + * function still returns 0. + * A non-zero return is an error from one of the allocator calls, or EINVAL + * when the realtime alignment adjustments cannot cover the original request.
+ */ +STATIC int /* error */ +xfs_bmap_alloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_fsblock_t adjust; /* adjustment to block numbers */ + xfs_alloctype_t atype=0; /* type for allocation routines */ + int error; /* error return value */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_mount_t *mp; /* mount point structure */ + int nullfb; /* true if ap->firstblock isn't set */ + int rt; /* true if inode is realtime and this is user data */ +#ifdef __KERNEL__ + xfs_extlen_t prod=0; /* product factor for allocators */ + xfs_extlen_t ralen=0; /* realtime allocation length */ +#endif +/* ISLEGAL(x,y): for realtime, x is below sb_rblocks; otherwise x is in the same AG as y, that AG number is below sb_agcount and the AG block number of x is below sb_agblocks. */ +#define ISLEGAL(x,y) \ + (rt ? \ + (x) < mp->m_sb.sb_rblocks : \ + XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) + + /* + * Set up variables. + */ + mp = ap->ip->i_mount; + nullfb = ap->firstblock == NULLFSBLOCK; + rt = (ap->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && ap->userdata; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); +#ifdef __KERNEL__ + if (rt) { + xfs_extlen_t extsz; /* file extent size for rt */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_extlen_t orig_alen; /* original ap->alen */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t orig_off; /* original ap->off */ + xfs_extlen_t mod_off; /* modulus calculations */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_rtblock_t rtx; /* realtime extent number */ + xfs_extlen_t temp; /* temp for rt calculations */ + + /* + * Set prod to match the realtime extent size. + */ + if (!(extsz = ap->ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + prod = extsz / mp->m_sb.sb_rextsize; + orig_off = ap->off; + orig_alen = ap->alen; + orig_end = orig_off + orig_alen; + /* + * If the file offset is unaligned vs. the extent size + * we need to align it.
This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment. + */ + mod_off = do_mod(orig_off, extsz); + if (mod_off) { + ap->alen += mod_off; + ap->off -= mod_off; + } + /* + * Same adjustment for the end of the requested area. + */ + if ((temp = (ap->alen % extsz))) + ap->alen += extsz - temp; + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + prevo = + ap->prevp->br_startoff == NULLFILEOFF ? + 0 : + (ap->prevp->br_startoff + + ap->prevp->br_blockcount); + if (ap->off != orig_off && ap->off < prevo) + ap->off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. + * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. + */ + nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ? + NULLFILEOFF : ap->gotp->br_startoff; + if (!ap->eof && + ap->off + ap->alen != orig_end && + ap->off + ap->alen > nexto) + ap->off = nexto > ap->alen ? nexto - ap->alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first legal spot and set + * the length so we hit the end. + */ + if ((ap->off != orig_off && ap->off < prevo) || + (ap->off + ap->alen != orig_end && + ap->off + ap->alen > nexto)) { + ap->off = prevo; + ap->alen = nexto - prevo; + } + /* + * If the result isn't a multiple of rtextents we need to + * remove blocks until it is. + */ + if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length.
+ */ + if (orig_off < ap->off || + orig_end > ap->off + ap->alen || + ap->alen - temp < orig_alen) + return XFS_ERROR(EINVAL); + /* + * Try to fix it by moving the start up. + */ + if (ap->off + temp <= orig_off) { + ap->alen -= temp; + ap->off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (ap->off + ap->alen - temp >= orig_end) + ap->alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + ap->alen -= orig_off - ap->off; + ap->off = orig_off; + ap->alen -= ap->alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. + */ + if (orig_off < ap->off || orig_end > ap->off + ap->alen) + return XFS_ERROR(EINVAL); + } + ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->off, extsz) || ap->alen % extsz) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->alen / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's legal again. + * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area.
+ */ + if (ap->eof && ap->off == 0) { + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->rval = rtx * mp->m_sb.sb_rextsize; + } else + ap->rval = 0; + } +#else + if (rt) + ap->rval = 0; +#endif /* __KERNEL__ */ + else if (nullfb) + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + else + ap->rval = ap->firstblock; + /* + * If allocating at eof, and there's a previous real block, + * try to use its last block as our starting point. + */ + if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && + !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + ISLEGAL(ap->prevp->br_startblock + ap->prevp->br_blockcount, + ap->prevp->br_startblock)) { + ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + /* + * Adjust for the gap between prevp and us. + */ + adjust = ap->off - + (ap->prevp->br_startoff + ap->prevp->br_blockcount); + if (adjust && + ISLEGAL(ap->rval + adjust, ap->prevp->br_startblock)) + ap->rval += adjust; + } + /* + * If not at eof, then compare the two neighbor blocks. + * Figure out whether either one gives us a good starting point, + * and pick the better one. + */ + else if (!ap->eof) { + xfs_fsblock_t gotbno; /* right side block number */ + xfs_fsblock_t gotdiff=0; /* right side difference */ + xfs_fsblock_t prevbno; /* left side block number */ + xfs_fsblock_t prevdiff=0; /* left side difference */ + + /* + * If there's a previous (left) block, select a requested + * start block based on it. + */ + if (ap->prevp->br_startoff != NULLFILEOFF && + !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + (prevbno = ap->prevp->br_startblock + + ap->prevp->br_blockcount) && + ISLEGAL(prevbno, ap->prevp->br_startblock)) { + /* + * Calculate gap to end of previous block. + */ + adjust = prevdiff = ap->off - + (ap->prevp->br_startoff + + ap->prevp->br_blockcount); + /* + * Figure the startblock based on the previous block's + * end and the gap size. + * Heuristic!
+ * If the gap is large relative to the piece we're + * allocating, or using it gives us an illegal block + * number, then just use the end of the previous block. + */ + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISLEGAL(prevbno + prevdiff, + ap->prevp->br_startblock)) + prevbno += adjust; + else + prevdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) + prevbno = NULLFSBLOCK; + } + /* + * No previous block or can't follow it, just default. + */ + else + prevbno = NULLFSBLOCK; + /* + * If there's a following (right) block, select a requested + * start block based on it. + */ + if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) { + /* + * Calculate gap to start of next block. + */ + adjust = gotdiff = ap->gotp->br_startoff - ap->off; + /* + * Figure the startblock based on the next block's + * start and the gap size. + */ + gotbno = ap->gotp->br_startblock; + /* + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an illegal block + * number, then just use the start of the next block + * offset by our length. + */ + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISLEGAL(gotbno - gotdiff, gotbno)) + gotbno -= adjust; + else if (ISLEGAL(gotbno - ap->alen, gotbno)) { + gotbno -= ap->alen; + gotdiff += adjust - ap->alen; + } else + gotdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) + gotbno = NULLFSBLOCK; + } + /* + * No next block, just default. + */ + else + gotbno = NULLFSBLOCK; + /* + * If both valid, pick the better one, else the only good + * one, else ap->rval is already set (to 0 or the inode block). + */ + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + ap->rval = prevdiff <= gotdiff ?
prevbno : gotbno; + else if (prevbno != NULLFSBLOCK) + ap->rval = prevbno; + else if (gotbno != NULLFSBLOCK) + ap->rval = gotbno; + } + /* + * If allowed, use ap->rval; otherwise must use firstblock since + * it's in the right allocation group. + */ + if (nullfb || rt || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + ; + else + ap->rval = ap->firstblock; + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + if (rt) { +#ifndef __KERNEL__ + ASSERT(0); +#else + xfs_rtblock_t rtb; + + atype = ap->rval == 0 ? + XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->rval, mp->m_sb.sb_rextsize); + rtb = ap->rval; + ap->alen = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; /* if nothing was found and prod > 1, the call below retries with a product factor of 1 */ + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, + ap->alen, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->rval = rtb; + if (ap->rval != NULLFSBLOCK) { + ap->rval *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->alen = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + if (XFS_IS_QUOTA_ON(mp) && + ap->ip->i_ino != mp->m_sb.sb_uquotino && + ap->ip->i_ino != mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? + XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, + (long)ralen); + } else + ap->alen = 0; +#endif /* __KERNEL__ */ + } + /* + * Normal allocation, done through xfs_alloc_vextent.
+ */ + else { + xfs_agnumber_t ag; + xfs_alloc_arg_t args; + xfs_extlen_t blen; /* best (longest) length seen in the per-AG scan */ + xfs_extlen_t delta; + int isaligned; /* set when an attempt uses mp->m_dalign alignment */ + xfs_extlen_t longest; + xfs_extlen_t need; + xfs_extlen_t nextminlen=0; /* minlen for the aligned retry after THIS_BNO */ + int notinit; /* set if some AG still had pagf_init clear */ + xfs_perag_t *pag; + xfs_agnumber_t startag; + int tryagain; /* set when an exact-bno (THIS_BNO) attempt goes first */ + + tryagain = isaligned = 0; + args.tp = ap->tp; + args.mp = mp; + args.fsbno = ap->rval; + args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); + blen = 0; + if (nullfb) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.total = ap->total; + /* + * Find the longest available space. + * We're going to try for the whole allocation at once. + */ + startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); + notinit = 0; + down_read(&mp->m_peraglock); + while (blen < ap->alen) { + pag = &mp->m_perag[ag]; + if (!pag->pagf_init && + (error = xfs_alloc_pagf_init(mp, args.tp, + ag, XFS_ALLOC_FLAG_TRYLOCK))) { + up_read(&mp->m_peraglock); + return error; + } + /* + * See xfs_alloc_fix_freelist... + */ + if (pag->pagf_init) { + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? + need - pag->pagf_flcount : 0; + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || + pag->pagf_longest > 0); + if (blen < longest) + blen = longest; + } else + notinit = 1; + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + } + up_read(&mp->m_peraglock); + /* + * Since the above loop did a BUF_TRYLOCK, it is + * possible that there is space for this request. + */ + if (notinit || blen < ap->minlen) + args.minlen = ap->minlen; + /* + * If the best seen length is less than the request + * length, use the best as the minimum. + */ + else if (blen < ap->alen) + args.minlen = blen; + /* + * Otherwise we've seen an extent as big as alen, + * use that as the minimum.
+ */ + else + args.minlen = ap->alen; + } else if (ap->low) { + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = args.minlen = ap->minlen; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.total = ap->total; + args.minlen = ap->minlen; + } + if (ap->ip->i_d.di_extsize) { + args.prod = ap->ip->i_d.di_extsize; + if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } else if (mp->m_sb.sb_blocksize >= NBPP) { + args.prod = 1; + args.mod = 0; + } else { + args.prod = NBPP >> mp->m_sb.sb_blocklog; + if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } + /* + * If we are not low on available data blocks, and the + * underlying logical volume manager is a stripe, and + * the file offset is zero then try to allocate data + * blocks on stripe unit boundary. + * NOTE: ap->aeof is only set if the allocation length + * is >= the stripe unit and the allocation offset is + * at the end of file. + */ + if (!ap->low && ap->aeof) { + if (!ap->off) { + args.alignment = mp->m_dalign; + atype = args.type; + isaligned = 1; + /* + * Adjust for alignment + */ + if (blen > args.alignment && blen <= ap->alen) + args.minlen = blen - args.alignment; + args.minalignslop = 0; + } else { + /* + * First try an exact bno allocation. + * If it fails then do a near or start bno + * allocation with alignment turned on. + */ + atype = args.type; + tryagain = 1; + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.alignment = 1; + /* + * Compute the minlen+alignment for the + * next case. Set slop so that the value + * of minlen+alignment+slop doesn't go up + * between the calls.
+ */ + if (blen > mp->m_dalign && blen <= ap->alen) + nextminlen = blen - mp->m_dalign; + else + nextminlen = args.minlen; + if (nextminlen + mp->m_dalign > args.minlen + 1) + args.minalignslop = + nextminlen + mp->m_dalign - + args.minlen - 1; + else + args.minalignslop = 0; + } + } else { + args.alignment = 1; + args.minalignslop = 0; + } + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.isfl = 0; + args.userdata = ap->userdata; + if ((error = xfs_alloc_vextent(&args))) + return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { + /* + * Exact allocation failed. Now try with alignment + * turned on. + */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = mp->m_dalign; + args.minlen = nextminlen; + args.minalignslop = 0; + isaligned = 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (isaligned && args.fsbno == NULLFSBLOCK) { + /* + * allocation failed, so turn off alignment and + * try again. + */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb && + args.minlen > ap->minlen) { /* still nothing: drop minlen back to the caller's ap->minlen and retry */ + args.minlen = ap->minlen; + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = ap->rval; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb) { /* final attempt: FIRST_AG from block 0, total of minlen, no minleft; sets ap->low */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = ap->minlen; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + ap->low = 1; + } + if (args.fsbno != NULLFSBLOCK) { + ap->firstblock = ap->rval = args.fsbno; + ASSERT(nullfb || fb_agno == args.agno || + (ap->low && fb_agno < args.agno)); + ap->alen = args.len; + ap->ip->i_d.di_nblocks += args.len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args.len; + /* + * Adjust the disk quota also. This was reserved + * earlier.
+ */ + if (XFS_IS_QUOTA_ON(mp) && + ap->ip->i_ino != mp->m_sb.sb_uquotino && + ap->ip->i_ino != mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? + XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT, + (long)args.len); + } else { + ap->rval = NULLFSBLOCK; + ap->alen = 0; + } + } + return 0; +#undef ISLEGAL +} + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the extent list is already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + * + * The single child block named by the root is read and checked, queued + * for freeing on cur->bc_private.b.flist and invalidated in the + * transaction. di_nblocks is decremented (and the block quota count, + * when quotas are on and this is not a quota inode), the in-core root is + * removed with xfs_iroot_realloc() and the fork format is set to + * XFS_DINODE_FMT_EXTENTS. Unless async is set the transaction is marked + * synchronous. + * + * *logflagsp is 0 when an error is returned, otherwise + * XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork). Returns 0 or the error from + * the btree read/check calls. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int async) /* xaction can be async */ +{ + /* REFERENCED */ + xfs_bmbt_block_t *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + xfs_bmbt_ptr_t *pp; /* ptr to block address */ + xfs_bmbt_block_t *rblock;/* root btree block */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) == 1); + ASSERT(INT_GET(rblock->bb_numrecs, ARCH_CONVERT) == 1); + ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); + mp = ip->i_mount; + pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1))) + return error; +#endif + cbno = INT_GET(*pp, ARCH_CONVERT); + if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, +
XFS_BMAP_BTREE_REF))) + return error; + cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); + if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + if (!async) + xfs_trans_set_sync(tp); + ip->i_d.di_nblocks--; + if (XFS_IS_QUOTA_ON(mp) && + ip->i_ino != mp->m_sb.sb_uquotino && + ip->i_ino != mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) /* clear the cursor's level-0 buffer slot if it holds the block just invalidated */ + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + return 0; +} + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extent list */ + int iflags, /* input flags */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to allocate reserved blocks */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ +#ifdef XFS_BMAP_TRACE + static char fname[] =
"xfs_bmap_del_extent"; +#endif + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + xfs_extnum_t nextents; /* number of extents in list */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + + XFS_STATS_INC(xfsstats.xs_del_exlist); + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(idx >= 0 && idx < nextents); + ASSERT(del->br_blockcount > 0); + ep = &ifp->if_u1.if_extents[idx]; + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = ISNULLSTARTBLOCK(got.br_startblock); + ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. 
+ */ + if (whichfork == XFS_DATA_FORK && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + if ((error = xfs_rtfree_extent(ip->i_transp, bno, + (xfs_extlen_t)len))) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + if (XFS_IS_QUOTA_ON(mp) && + ip->i_ino != mp->m_sb.sb_uquotino && + ip->i_ino != mp->m_sb.sb_gquotino) + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + if (XFS_IS_QUOTA_ON(mp) && + ip->i_ino != mp->m_sb.sb_uquotino && + ip->i_ino != mp->m_sb.sb_gquotino) + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + } + da_old = da_new = 0; + } else { + da_old = STARTBLOCKVAL(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. 
+ */ + xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork); + xfs_bmap_delete_exlist(ip, idx, 1, whichfork); + ifp->if_lastex = idx; + if (delay) + break; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= XFS_ILOG_FEXT(whichfork); + break; + } + if ((error = xfs_bmbt_delete(cur, iflags & XFS_BMAPI_ASYNC, &i))) + goto done; + ASSERT(i == 1); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "2", ip, idx, + whichfork); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork); + if (!cur) { + flags |= XFS_ILOG_FEXT(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. 
+ */ + temp = got.br_blockcount - del->br_blockcount; + xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork); + xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "1", ip, idx, + whichfork); + da_new = temp; + break; + } + xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork); + if (!cur) { + flags |= XFS_ILOG_FEXT(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_bmbt_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_bmbt_insert(cur, &i); + if (error && error != ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + ASSERT(i == 1); + /* + * Update the btree record back + * to the original value. 
+ */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. + */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = XFS_ERROR(ENOSPC); + goto done; + } + ASSERT(i == 1); + } else + flags |= XFS_ILOG_FEXT(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = NULLSTARTBLOCK((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + NULLSTARTBLOCK((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + NULLSTARTBLOCK((int)temp2); + } + } + } + xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork); + xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL, + whichfork); + xfs_bmap_insert_exlist(ip, idx + 1, 1, &new, whichfork); + ifp->if_lastex = idx + 1; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new), + rsvd); +done: + *logflagsp = flags; + return error; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. 
+ */
+STATIC void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist,	/* header of the free item list */
+	xfs_bmap_free_item_t	*prev,	/* item preceding "free", or NULL */
+	xfs_bmap_free_item_t	*free)	/* item to unlink and release */
+{
+	xfs_bmap_free_item_t	**linkp;	/* the link that points at "free" */
+
+	/*
+	 * "free" is referenced either by its predecessor or, when it is
+	 * the first item, by the list header.  Redirect that one link.
+	 */
+	linkp = prev ? &prev->xbfi_next : &flist->xbf_first;
+	*linkp = free->xbfi_next;
+	flist->xbf_count--;
+	kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Drop "count" records from the extent array of the given fork of "ip",
+ * beginning at index "idx".  The records that followed the deleted range
+ * are moved down to close the gap (source and destination lie in the same
+ * array and may overlap), then the array is shrunk by "count" records.
+ */
+STATIC void
+xfs_bmap_delete_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of first record to delete */
+	xfs_extnum_t	count,		/* number of records to delete */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*fork;		/* fork being edited */
+	xfs_extnum_t	nleft;		/* records remaining after the delete */
+	xfs_bmbt_rec_t	*recs;		/* start of the extent array */
+
+	fork = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(fork->if_flags & XFS_IFEXTENTS);
+	recs = fork->if_u1.if_extents;
+	nleft = fork->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - count;
+	/* Slide the tail [idx + count, end) down to start at idx. */
+	ovbcopy(&recs[idx + count], &recs[idx],
+		(nleft - idx) * sizeof(*recs));
+	xfs_iext_realloc(ip, -count, whichfork);
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error; 0 on success */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated; in/out,
+						   set to the new child block */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller;
+						   *curp must be NULL on entry */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags; left 0
+						   if the allocation fails */
+	int			whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_block_t	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	xfs_bmbt_block_t	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_t		*ep;		/* extent list pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i;		/* extent list index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* extent list size */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Make space in the inode incore, and flag the fork as now
+	 * carrying a btree root (XFS_IFBROOT).
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+	/*
+	 * Fill in the root: level 1, exactly one record, no siblings.
+	 */
+	block = ifp->if_broot;
+	INT_SET(block->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	INT_SET(block->bb_level, ARCH_CONVERT, 1);
+	INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+	INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	mp = ip->i_mount;
+	cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
+		whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 *
+	 * Choose where to ask for the single child block: if nothing has
+	 * been allocated in this transaction yet (*firstblock is
+	 * NULLFSBLOCK) start from the block derived from the inode number;
+	 * otherwise start at (xbf_low set) or near *firstblock.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	args.tp = tp;
+	args.mp = mp;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;	/* exactly one block */
+	args.total = args.minleft = args.alignment = args.mod = args.isfl =
+		args.minalignslop = 0;
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	if ((error = xfs_alloc_vextent(&args))) {	/* NOTE(review): this path
+			gives back the root space and the cursor, but the fork
+			format was already switched to BTREE and XFS_IFBROOT
+			was already set above; neither is reverted here.
+			Confirm that callers cope with that state. */
+		xfs_iroot_realloc(ip, -1, whichfork);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 * The block must also lie in the allocation group of *firstblock,
+	 * or in a later one when xbf_low is set (asserted below).
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;			/* account the child block */
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    ip->i_ino != mp->m_sb.sb_uquotino &&
+	    ip->i_ino != mp->m_sb.sb_gquotino)	/* quota inodes are not charged */
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block: a level 0 block with no siblings whose
+	 * records are copied from the incore extent list.
+	 */
+	ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+	INT_SET(ablock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	INT_ZERO(ablock->bb_level, ARCH_CONVERT);
+	INT_ZERO(ablock->bb_numrecs, ARCH_CONVERT);
+	INT_SET(ablock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(ablock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (ep = ifp->if_u1.if_extents, i = 0; i < nextents; i++, ep++) {
+		if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {	/* skip
+				entries whose start block is "null"; presumably
+				these are delayed allocations - confirm */
+			*arp++ = *ep;
+			INT_MOD(ablock->bb_numrecs, ARCH_CONVERT, +1);
+		}
+	}
+	ASSERT(INT_GET(ablock->bb_numrecs, ARCH_CONVERT) == XFS_IFORK_NEXTENTS(ip, whichfork));
+	/*
+	 * Fill in the root key and pointer: the key is the start offset of
+	 * the child's first record, the pointer is the child's block number.
+	 */
+	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(arp));
+	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	INT_SET(*pp, ARCH_CONVERT, args.fsbno);
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_bmbt_log_recs(cur, abp, 1, INT_GET(ablock->bb_numrecs, ARCH_CONVERT));
+	ASSERT(*curp == NULL);
+	*curp = cur;				/* cursor now belongs to the caller */
+	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
+	return 0;
+}
+
+/*
+ * Insert new item(s) in the extent list for inode "ip".
+ * Count new items are inserted at offset idx.
+ */ +STATIC void +xfs_bmap_insert_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* extent list base */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* extent list size */ + xfs_extnum_t to; /* extent list index */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_realloc(ip, count, whichfork); + base = ifp->if_u1.if_extents; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ovbcopy(&base[idx], &base[idx + count], + (nextents - (idx + count)) * sizeof(*base)); + for (to = idx; to < idx + count; to++, new++) + xfs_bmbt_set_all(&base[to], new); +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int flags; /* logging flags returned */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_local_to_extents"; +#endif + xfs_ifork_t *ifp; /* inode fork pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is illegal. 
+ */ + ASSERT(!((ip->i_d.di_mode & IFMT) == IFREG && + whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + flags = 0; + error = 0; + if (ifp->if_bytes) { + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent list block */ + xfs_bmbt_rec_t *ep; /* extent list pointer */ + + args.tp = tp; + args.mp = ip->i_mount; + ASSERT(ifp->if_flags & XFS_IFINLINE); + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.mod = args.minleft = args.alignment = args.wasdel = + args.isfl = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + if ((error = xfs_alloc_vextent(&args))) + goto done; + /* + * Can't fail, the space was reserved. 
+ */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + bcopy(ifp->if_u1.if_data, (char *)XFS_BUF_PTR(bp), + ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_iext_realloc(ip, 1, whichfork); + ep = ifp->if_u1.if_extents; + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + if (XFS_IS_QUOTA_ON(args.mp) && + ip->i_ino != args.mp->m_sb.sb_uquotino && + ip->i_ino != args.mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + 1L); + flags |= XFS_ILOG_FEXT(whichfork); + } else + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + flags |= XFS_ILOG_CORE; +done: + *logflagsp = flags; + return error; +} + +xfs_bmbt_rec_t * /* pointer to found extent entry */ +xfs_bmap_do_search_extents( + xfs_bmbt_rec_t *base, /* base of extent list */ + xfs_extnum_t lastx, /* last extent index used */ + xfs_extnum_t nextents, /* extent list size */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_t *ep; /* extent list entry pointer */ + xfs_bmbt_irec_t got; /* extent list entry, decoded */ + int high; /* high index of binary search */ + int low; /* low index of binary search */ + + if (lastx != NULLEXTNUM && lastx < nextents) + ep = base + lastx; + else + ep = NULL; + prevp->br_startoff = NULLFILEOFF; + if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) && + bno < got.br_startoff + + 
(got.br_blockcount = xfs_bmbt_get_blockcount(ep))) + *eofp = 0; + else if (ep && lastx < nextents - 1 && + bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) && + bno < got.br_startoff + + (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) { + lastx++; + ep++; + *eofp = 0; + } else if (nextents == 0) + *eofp = 1; + else if (bno == 0 && + (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) { + ep = base; + lastx = 0; + got.br_blockcount = xfs_bmbt_get_blockcount(ep); + *eofp = 0; + } else { + /* binary search the extents array */ + low = 0; + high = nextents - 1; + while (low <= high) { + XFS_STATS_INC(xfsstats.xs_cmp_exlist); + lastx = (low + high) >> 1; + ep = base + lastx; + got.br_startoff = xfs_bmbt_get_startoff(ep); + got.br_blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < got.br_startoff) + high = lastx - 1; + else if (bno >= got.br_startoff + got.br_blockcount) + low = lastx + 1; + else { + got.br_startblock = xfs_bmbt_get_startblock(ep); + got.br_state = xfs_bmbt_get_state(ep); + *eofp = 0; + *lastxp = lastx; + *gotp = got; + return ep; + } + } + if (bno >= got.br_startoff + got.br_blockcount) { + lastx++; + if (lastx == nextents) { + *eofp = 1; + got.br_startblock = xfs_bmbt_get_startblock(ep); + got.br_state = xfs_bmbt_get_state(ep); + *prevp = got; + ep = NULL; + } else { + *eofp = 0; + xfs_bmbt_get_all(ep, prevp); + ep++; + got.br_startoff = xfs_bmbt_get_startoff(ep); + got.br_blockcount = xfs_bmbt_get_blockcount(ep); + } + } else { + *eofp = 0; + if (ep > base) + xfs_bmbt_get_all(ep - 1, prevp); + } + } + if (ep) { + got.br_startblock = xfs_bmbt_get_startblock(ep); + got.br_state = xfs_bmbt_get_state(ep); + } + *lastxp = lastx; + *gotp = got; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). 
+ * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int whichfork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_t *base; /* base of extent list */ + xfs_extnum_t lastx; /* last extent index used */ + xfs_extnum_t nextents; /* extent list size */ + + XFS_STATS_INC(xfsstats.xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, whichfork); + lastx = ifp->if_lastex; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + base = &ifp->if_u1.if_extents[0]; + + return xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp, + lastxp, gotp, prevp); +} + + +#ifdef XFS_BMAP_TRACE +/* + * Add a bmap trace buffer entry. Base routine for the others. 
+ */
+STATIC void
+xfs_bmap_trace_addentry(
+	int		opcode,		/* operation; logged in the low 16 bits
+					   of the first trace word */
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(ies) */
+	xfs_extnum_t	cnt,		/* count of entries, 1 or 2 */
+	xfs_bmbt_rec_t	*r1,		/* first record; must not be NULL */
+	xfs_bmbt_rec_t	*r2,		/* second record or null; must be NULL
+					   when cnt is 1 */
+	int		whichfork)	/* data or attr fork; logged shifted
+					   left by 16 in the first word */
+{
+	xfs_bmbt_rec_t	tr2;		/* all-zero stand-in for a missing r2 */
+
+	ASSERT(cnt == 1 || cnt == 2);
+	ASSERT(r1 != NULL);
+	if (cnt == 1) {
+		ASSERT(r2 == NULL);
+		r2 = &tr2;		/* so the r2 slots below log zeros */
+		bzero(&tr2, sizeof(tr2));
+	} else
+		ASSERT(r2 != NULL);
+	ktrace_enter(xfs_bmap_trace_buf,	/* first copy: xfs_bmap_trace_buf */
+		(void *)(__psint_t)(opcode | (whichfork << 16)),
+		(void *)fname, (void *)desc, (void *)ip,
+		(void *)(__psint_t)idx,
+		(void *)(__psint_t)cnt,
+		(void *)(__psunsigned_t)(ip->i_ino >> 32),	/* inode number, high half */
+		(void *)(__psunsigned_t)(unsigned)ip->i_ino,	/* inode number, low half */
+		(void *)(__psunsigned_t)(INT_GET(r1->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r1->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l1, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l1, ARCH_CONVERT))
+		);
+	ASSERT(ip->i_xtrace);
+	ktrace_enter(ip->i_xtrace,	/* second copy: the inode's own trace,
+					   same values in the same order */
+		(void *)(__psint_t)(opcode | (whichfork << 16)),
+		(void *)fname, (void *)desc, (void *)ip,
+		(void *)(__psint_t)idx,
+		(void *)(__psint_t)cnt,
+		(void *)(__psunsigned_t)(ip->i_ino >> 32),
+		(void *)(__psunsigned_t)(unsigned)ip->i_ino,
+		(void *)(__psunsigned_t)(INT_GET(r1->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r1->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l1, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l1, ARCH_CONVERT))
+		);
+}
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
+ * The records are read from the fork's extent list at idx (and idx + 1
+ * when cnt is 2), so this must run while they are still in the list.
+ */
+STATIC void
+xfs_bmap_trace_delete(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
+	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
+		cnt, &ifp->if_u1.if_extents[idx],
+		cnt == 2 ? &ifp->if_u1.if_extents[idx + 1] : NULL,
+		whichfork);
+}
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
+ * reading in the extents list from the disk (in the btree).
+ * The decoded records r1/r2 are first packed into temporary
+ * xfs_bmbt_rec_t copies with xfs_bmbt_set_all(), since the trace routine
+ * takes packed records.
+ */
+STATIC void
+xfs_bmap_trace_insert(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
+	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
+	xfs_bmbt_irec_t	*r1,		/* inserted record 1 */
+	xfs_bmbt_irec_t	*r2,		/* inserted record 2 or null */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	tr1;		/* compressed record 1 */
+	xfs_bmbt_rec_t	tr2;		/* compressed record 2 if needed */
+
+	xfs_bmbt_set_all(&tr1, r1);
+	if (cnt == 2) {
+		ASSERT(r2 != NULL);
+		xfs_bmbt_set_all(&tr2, r2);
+	} else {
+		ASSERT(cnt == 1);
+		ASSERT(r2 == NULL);
+	}
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
+		cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
+}
+
+/*
+ * Add bmap trace entry after updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_post_update(
+	char		*fname,		/* name of the calling function */
+	char		*desc,		/* short tag for the operation */
+	xfs_inode_t	*ip,		/* inode whose extent list changed */
+	xfs_extnum_t	idx,		/* index of the record just updated */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*fork;		/* fork selected by whichfork */
+	xfs_bmbt_rec_t	*rec;		/* the record at idx, as it is now */
+
+	fork = XFS_IFORK_PTR(ip, whichfork);
+	rec = fork->if_u1.if_extents + idx;
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
+		1, rec, NULL, whichfork);
+}
+
+/*
+ * Record a trace entry showing extent list entry "idx" as it looks
+ * before the caller modifies it in place.
+ */
+STATIC void
+xfs_bmap_trace_pre_update(
+	char		*fname,		/* name of the calling function */
+	char		*desc,		/* short tag for the operation */
+	xfs_inode_t	*ip,		/* inode whose extent list will change */
+	xfs_extnum_t	idx,		/* index of the record about to change */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*fork;		/* fork selected by whichfork */
+	xfs_bmbt_rec_t	*rec;		/* the record at idx, still unmodified */
+
+	fork = XFS_IFORK_PTR(ip, whichfork);
+	rec = fork->if_u1.if_extents + idx;
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
+		rec, NULL, whichfork);
+}
+#endif	/* XFS_BMAP_TRACE */
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +#if defined(DEBUG) && defined(XFS_RW_TRACE) +STATIC void +xfs_bunmap_trace( + xfs_inode_t *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + inst_t *ra) +{ + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(__psint_t)XFS_BUNMAPI, + (void *)ip, + (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff), + (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff), + (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff), + (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff), + (void *)(__psint_t)len, + (void *)(__psint_t)flags, + (void *)(__psint_t)private.p_cpuid, + (void *)ra, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0); +} +#endif + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. 
+ */
+int						/* error code; 0 if the fork was added
+						   or already existed */
+xfs_bmap_add_attrfork(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			rsvd)		/* OK to allocate reserved blocks in trans */
+{
+	int			blks;		/* space reservation */
+	int			committed;	/* xaction was committed */
+	int			error;		/* error return value */
+	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
+	xfs_bmap_free_t		flist;		/* freed extent list */
+	int			logflags;	/* logging flags */
+	xfs_mount_t		*mp;		/* mount structure */
+	unsigned long		s;		/* spinlock spl value */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	if (XFS_IFORK_Q(ip))		/* already has an attribute fork */
+		return 0;
+	mp = ip->i_mount;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
+	if (rsvd)
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+		goto error0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (rsvd) {
+			error = xfs_trans_reserve_blkquota_force(tp, ip, blks);
+		} else {
+			error = xfs_trans_reserve_blkquota(tp, ip, blks);
+		}
+
+		if (error) {	/* this exit cancels without XFS_TRANS_ABORT,
+				   unlike the error labels at the bottom */
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+			return error;
+		}
+	}
+	if (XFS_IFORK_Q(ip))		/* re-check now that the lock is held;
+					   error is 0 here, so 0 is returned */
+		goto error1;
+	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+		/*
+		 * For inodes coming from pre-6.2 filesystems.
+		 */
+		ASSERT(ip->i_d.di_aformat == 0);
+		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	}
+	ASSERT(ip->i_d.di_anextents == 0);
+	VN_HOLD(XFS_ITOV(ip));
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	switch (ip->i_d.di_format) {	/* choose di_forkoff; every value set
+					   here is a byte count shifted right
+					   by 3 */
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = mp->m_attroffset >> 3;
+		break;
+	default:
+		ASSERT(0);
+		error = XFS_ERROR(EINVAL);
+		goto error1;
+	}
+	ip->i_df.if_ext_max =
+		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);	/* NOTE(review):
+			not released again on the error2 path below - confirm
+			that inode teardown takes care of it */
+	ip->i_afp->if_ext_max =
+		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ip->i_afp->if_flags = XFS_IFEXTENTS;
+	logflags = 0;
+	XFS_BMAP_INIT(&flist, &firstblock);
+	switch (ip->i_d.di_format) {	/* format-specific data fork fixup */
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+			&flist, &logflags);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	default:
+		error = 0;
+		break;
+	}
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (error)
+		goto error2;
+	if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {	/* unlocked test first,
+				then repeated under the superblock lock */
+		s = XFS_SB_LOCK(mp);
+		if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
+			XFS_SB_VERSION_ADDATTR(&mp->m_sb);
+			XFS_SB_UNLOCK(mp, s);
+			xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+		} else
+			XFS_SB_UNLOCK(mp, s);
+	}
+	if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed)))
+		goto error2;
+	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL);
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	return error;
+error2:				/* drop any extents still queued for freeing */
+	xfs_bmap_cancel(&flist);
+error1:				/* inode is locked at this point */
+	ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:				/* log reservation failed, or fall-through */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	return error;
+}
+
+/*
+ * Add the extent to the list of extents to be freed at transaction end.
+ * The list is maintained sorted (by block number).
+ * A new list item is allocated with KM_SLEEP and linked in ahead of the
+ * first existing item whose start block is >= bno.
+ */
+/* ARGSUSED */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent; stored as
+						   an xfs_extlen_t */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	xfs_mount_t		*mp)		/* mount point structure; only
+						   used by the DEBUG checks */
+{
+	xfs_bmap_free_item_t	*cur;		/* current (next) element */
+	xfs_bmap_free_item_t	*new;		/* new element */
+	xfs_bmap_free_item_t	*prev;		/* previous element */
+#ifdef DEBUG
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(len > 0);
+	ASSERT(len <= MAXEXTLEN);
+	ASSERT(!ISNULLSTARTBLOCK(bno));
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(agbno < mp->m_sb.sb_agblocks);
+	ASSERT(len < mp->m_sb.sb_agblocks);
+	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);	/* extent stays inside
+							   one allocation group */
+#endif
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new->xbfi_startblock = bno;
+	new->xbfi_blockcount = (xfs_extlen_t)len;
+	for (prev = NULL, cur = flist->xbf_first;
+	     cur != NULL;
+	     prev = cur, cur = cur->xbfi_next) {
+		if (cur->xbfi_startblock >= bno)	/* insertion point found */
+			break;
+	}
+	if (prev)
+		prev->xbfi_next = new;
+	else
+		flist->xbf_first = new;
+	new->xbfi_next = cur;
+	flist->xbf_count++;
+}
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.  Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	xfs_mount_t	*mp,		/* mount whose limit is being set */
+	int		whichfork)	/* data or attr fork */
+{
+	int		level;		/* levels counted so far */
+	uint		maxblocks;	/* most blocks needed at this level */
+	uint		maxleafents;	/* most leaf records a fork can have */
+	int		maxrootrecs;	/* most records the inode root holds */
+	int		minleafrecs;	/* fewest records in a leaf block */
+	int		minnoderecs;	/* fewest records in a node block */
+	int		sz;		/* bytes available to the root */
+
+	/*
+	 * The extent count of a fork is bounded by the width of its
+	 * on-disk counter: di_nextents (signed 32 bits, xfs_extnum_t) for
+	 * the data fork, di_anextents (signed 16 bits, xfs_aextnum_t) for
+	 * the attribute fork.  The root lives in the inode, in the part
+	 * below m_attroffset for the data fork and in the remainder of
+	 * the inode for the attribute fork.
+	 */
+	if (whichfork == XFS_DATA_FORK) {
+		maxleafents = MAXEXTNUM;
+		sz = mp->m_attroffset;
+	} else {
+		maxleafents = MAXAEXTNUM;
+		sz = mp->m_sb.sb_inodesize - mp->m_attroffset;
+	}
+	minleafrecs = mp->m_bmap_dmnr[0];
+	minnoderecs = mp->m_bmap_dmnr[1];
+	maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+
+	/*
+	 * Worst case is every block minimally full.  Start with the leaf
+	 * level and add a level until the blocks of the level below fit
+	 * under the inode root.
+	 */
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	level = 1;
+	while (maxblocks > 1) {
+		if (maxblocks <= maxrootrecs)
+			maxblocks = 1;
+		else
+			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+		level++;
+	}
+	mp->m_bm_maxlevels[whichfork] = level;
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.  We never free any extents in
+ * the first transaction.  This is to allow the caller to make the first
+ * transaction a synchronous one so that the pointers to the data being
+ * broken in this transaction will be permanent before the data is actually
+ * freed.  This is necessary to prevent blocks from being reallocated
+ * and written to before the free and reallocation are actually permanent.
+ * We do not just make the first transaction synchronous here, because
+ * there are more efficient ways to gain the same protection in some cases
+ * (see the file truncation code).
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+/*
+ * Sequence, as implemented below: log an EFI naming every extent on
+ * flist in *tp, duplicate *tp, commit the original, re-reserve log
+ * space in the duplicate, then free each extent there and record it in
+ * a matching EFD.  *tp is replaced by the duplicate (and *committed set
+ * to 1) before any commit error is returned.  firstblock is not
+ * referenced by this routine, hence ARGSUSED.
+ */
+/*ARGSUSED*/
+int						/* error */
+xfs_bmap_finish(
+	xfs_trans_t		**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_fsblock_t		firstblock,	/* controlled ag for allocs */
+	int			*committed)	/* xact committed or not */
+{
+	xfs_efd_log_item_t	*efd;		/* extent free data */
+	xfs_efi_log_item_t	*efi;		/* extent free intention */
+	int			error;		/* error return value */
+	xfs_bmap_free_item_t	*free;		/* free extent list item */
+	unsigned int		logres;		/* new log reservation */
+	unsigned int		logcount;	/* new log count */
+	xfs_mount_t		*mp;		/* filesystem mount structure */
+	xfs_bmap_free_item_t	*next;		/* next item on free list */
+	xfs_trans_t		*ntp;		/* new transaction pointer */
+
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	/* Nothing queued for freeing: leave the transaction untouched. */
+	if (flist->xbf_count == 0) {
+		*committed = 0;
+		return 0;
+	}
+	/* Record the intent to free every listed extent in the current xact. */
+	ntp = *tp;
+	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+	for (free = flist->xbf_first; free; free = free->xbfi_next)
+		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+			free->xbfi_blockcount);
+	/*
+	 * Remember the current log reservation so the same amount can be
+	 * reserved again for the duplicate transaction.
+	 */
+	logres = ntp->t_log_res;
+	logcount = ntp->t_log_count;
+	ntp = xfs_trans_dup(*tp);
+	error = xfs_trans_commit(*tp, 0, NULL);
+	*tp = ntp;
+	*committed = 1;
+	/*
+	 * We have a new transaction, so we should return committed=1,
+	 * even though we're returning an error.
+	 */
+	if (error) {
+		return error;
+	}
+	if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
+			logcount)))
+		return error;
+	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+	/*
+	 * "next" is fetched before the body runs because the current item
+	 * is removed from the list at the bottom of each iteration.
+	 */
+	for (free = flist->xbf_first; free != NULL; free = next) {
+		next = free->xbfi_next;
+		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+				free->xbfi_blockcount))) {
+			/*
+			 * The bmap free list will be cleaned up at a
+			 * higher level.  The EFI will be canceled when
+			 * this transaction is aborted.
+			 * Need to force shutdown here to make sure it
+			 * happens, since this transaction may not be
+			 * dirty yet.
+			 */
+			mp = ntp->t_mountp;
+			if (!XFS_FORCED_SHUTDOWN(mp))
+				xfs_force_shutdown(mp,
+						   (error == EFSCORRUPTED) ?
+						   XFS_CORRUPT_INCORE :
+						   XFS_METADATA_IO_ERROR);
+			return error;
+		}
+		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+			free->xbfi_blockcount);
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	return 0;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
+{
+	xfs_bmap_free_item_t	*free;	/* free list item */
+	xfs_bmap_free_item_t	*next;
+
+	if (flist->xbf_count == 0)
+		return;
+	ASSERT(flist->xbf_first != NULL);
+	/*
+	 * Always delete the head (prev == NULL).  The closing ASSERT
+	 * presumes xfs_bmap_del_free() decrements xbf_count -- its body
+	 * is not visible here, confirm there.
+	 */
+	for (free = flist->xbf_first; free; free = next) {
+		next = free->xbfi_next;
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Returns EINVAL if the specified file is not swappable.
+ *
+ * A file passes when it is empty, or when every block up to its size is
+ * covered by normal-state extents with no hole in between.  Both the
+ * I/O lock and the inode lock are held exclusively while checking.
+ */
+int						/* error */
+xfs_bmap_check_swappable(
+	xfs_inode_t	*ip)			/* incore inode */
+{
+	xfs_ifork_t	*dfork;			/* data fork of the inode */
+	xfs_bmbt_rec_t	*recs;			/* packed extent records */
+	xfs_bmbt_irec_t	irec;			/* one record, unpacked */
+	xfs_fileoff_t	size_fsb;		/* file size in fs blocks */
+	xfs_fileoff_t	covered;		/* blocks known mapped so far */
+	xfs_extnum_t	nrecs;			/* number of extent records */
+	xfs_extnum_t	idx;			/* record being examined */
+	int		error = 0;		/* value handed back */
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+
+	/* An empty file has nothing that could disqualify it. */
+	if (ip->i_d.di_size == 0)
+		goto out_unlock;
+
+	ASSERT(XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS);
+
+	/* Pull the extent list into memory if it is not there yet. */
+	dfork = &ip->i_df;
+	if (!(dfork->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			goto out_unlock;
+	}
+
+	/*
+	 * Walk the extents in order until the file size is covered.
+	 * A gap before an extent, or an extent that is not in the
+	 * normal state, would need a transaction at I/O time, so
+	 * either one disqualifies the file.
+	 */
+	size_fsb = XFS_B_TO_FSB(ip->i_mount, ip->i_d.di_size);
+	nrecs = dfork->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	recs = &dfork->if_u1.if_extents[0];
+	covered = 0;
+	for (idx = 0; idx < nrecs; idx++) {
+		xfs_bmbt_get_all(&recs[idx], &irec);
+		if (irec.br_startoff > covered ||
+		    irec.br_state != XFS_EXT_NORM)
+			break;
+		covered = irec.br_startoff + irec.br_blockcount;
+		if (covered >= size_fsb)
+			goto out_unlock;
+	}
+
+	/* Found a hole or unwritten extent, or ran out of extents early. */
+	error = XFS_ERROR(EINVAL);
+
+out_unlock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* base of extent array */ + xfs_bmbt_rec_t *ep; /* pointer to an extent entry */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + base = &ifp->if_u1.if_extents[0]; + for (lastaddr = 0, max = lowest, ep = base; + ep < &base[nextents]; + ep++) { + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. 
+ *
+ * Fails with EIO when the fork is in none of the btree, extents or
+ * local formats.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_ifork_t	*fork;			/* fork being searched */
+	xfs_bmbt_rec_t	*rec;			/* record found by the search */
+	xfs_bmbt_irec_t	cur_ext;		/* extent at or after target */
+	xfs_bmbt_irec_t	prev_ext;		/* extent before cur_ext */
+	xfs_fileoff_t	target;			/* offset being looked up */
+	xfs_extnum_t	idx;			/* index reported by the search */
+	int		past_end;		/* search ran off the list */
+	int		format;			/* fork format */
+	int		error;			/* error return value */
+
+	format = XFS_IFORK_FORMAT(ip, whichfork);
+	if (format == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	if (format != XFS_DINODE_FMT_BTREE &&
+	    format != XFS_DINODE_FMT_EXTENTS)
+		return XFS_ERROR(EIO);
+
+	fork = XFS_IFORK_PTR(ip, whichfork);
+	if (!(fork->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	/* Look up the block just below the caller's limit. */
+	target = *last_block - 1;
+	rec = xfs_bmap_search_extents(ip, target, whichfork, &past_end, &idx,
+		&cur_ext, &prev_ext);
+
+	/*
+	 * If that block is unmapped, fall back to the end of the
+	 * previous extent, or to 0 when there is none.  If it is mapped,
+	 * *last_block is already the right answer.
+	 */
+	if (past_end || xfs_bmbt_get_startoff(rec) > target)
+		*last_block = (prev_ext.br_startoff == NULLFILEOFF) ? 0 :
+			prev_ext.br_startoff + prev_ext.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ *
+ * Also returns 0 in *last_block when the fork has no extents at all.
+ * Fails with EIO when the fork is in none of the btree, extents or
+ * local formats.
+ */
+int						/* error */
+xfs_bmap_last_offset(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_ifork_t	*fork;			/* fork being examined */
+	xfs_bmbt_rec_t	*recs;			/* packed extent records */
+	xfs_bmbt_rec_t	*tail;			/* final extent record */
+	xfs_extnum_t	nrecs;			/* number of extent records */
+	int		format;			/* fork format */
+	int		error;			/* error return value */
+
+	format = XFS_IFORK_FORMAT(ip, whichfork);
+	if (format == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	if (format != XFS_DINODE_FMT_BTREE &&
+	    format != XFS_DINODE_FMT_EXTENTS)
+		return XFS_ERROR(EIO);
+
+	fork = XFS_IFORK_PTR(ip, whichfork);
+	if (!(fork->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	nrecs = fork->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nrecs == 0) {
+		*last_block = 0;
+		return 0;
+	}
+
+	/* The answer is the first offset past the final extent. */
+	recs = &fork->if_u1.if_extents[0];
+	ASSERT(recs != NULL);
+	tail = &recs[nrecs - 1];
+	*last_block = xfs_bmbt_get_startoff(tail) +
+		      xfs_bmbt_get_blockcount(tail);
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not. For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ *
+ * In non-DEBUG builds the data fork answer comes from di_size alone;
+ * DEBUG builds inspect the extent and assert that di_size agrees.
+ */
+int					/* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*fork;		/* fork being examined */
+	xfs_bmbt_irec_t	ext;		/* the fork's only extent, unpacked */
+	int		single;		/* extent is exactly block 0 */
+
+#ifndef DEBUG
+	if (whichfork == XFS_DATA_FORK)
+		return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
+#endif	/* !DEBUG */
+	/* Only an extents-format fork with one extent can qualify. */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1 ||
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return 0;
+
+	fork = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(fork->if_flags & XFS_IFEXTENTS);
+	xfs_bmbt_get_all(fork->if_u1.if_extents, &ext);
+	single = (ext.br_startoff == 0 && ext.br_blockcount == 1);
+
+	/* A one-block data fork must also be exactly one block long. */
+	ASSERT(!single || whichfork != XFS_DATA_FORK ||
+	       ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
+	return single;
+}
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+/*
+ * Walks down the leftmost edge of the fork's bmap btree from the
+ * in-inode root to the first leaf, then follows bb_rightsib links
+ * across the leaf level, copying each leaf's records into
+ * ifp->if_u1.if_extents.  Returns 0 on success, the error from
+ * xfs_btree_read_bufl() if a block cannot be read, or EFSCORRUPTED if
+ * a block fails its sanity check, holds more records than the extent
+ * array has room for, or fails the no-state record check.
+ */
+int					/* error */
+xfs_bmap_read_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_bmbt_block_t	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_read_extents";
+#endif
+	xfs_extnum_t		i;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	/* REFERENCED */
+	xfs_extnum_t		room;	/* number of entries there's room for */
+	xfs_bmbt_rec_t		*trp;	/* target record pointer */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/* The attribute fork is always checked as a no-state format. */
+	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+					XFS_EXTFMT_INODE(ip);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			return error;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, level),
+			error0);
+		/* Leaf reached: leave the loop with bp still held. */
+		if (level == 0)
+			break;
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
+			1, mp->m_bmap_dmxr[1]);
+#ifndef __KERNEL__
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)),
+			error0);
+#else	/* additional, temporary, debugging code */
+		if (!(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)))) {
+			cmn_err(CE_NOTE,
+			"xfs_bmap_read_extents: FSB Sanity Check:");
+			if (!(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount))
+				cmn_err(CE_NOTE,
+					"bad AG count %d < agcount %d",
+					XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)),
+					mp->m_sb.sb_agcount);
+			if (!(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks))
+				cmn_err(CE_NOTE,
+					"bad AG BNO %d < %d",
+					XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)),
+					mp->m_sb.sb_agblocks);
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto error0;
+		}
+#endif
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		xfs_trans_brelse(tp, bp);
+	}
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	room = ifp->if_bytes / (uint)sizeof(*trp);
+	trp = ifp->if_u1.if_extents;
+	i = 0;
+	/*
+	 * Loop over all leaf nodes. Copy information to the extent list.
+	 */
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+		/*
+		 * Refuse to copy more records than the caller-sized
+		 * extent array (if_bytes) can hold.
+		 */
+		if (i + num_recs > room) {
+			ASSERT(i + num_recs <= room);
+			xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+				"corrupt dinode %Lu, (btree extents). "
+				"Unmount and run xfs_repair.",
+				(unsigned long long) ip->i_ino);
+			goto error0;
+		}
+#ifndef __KERNEL__
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, 0),
+			error0);
+#else	/* additional, temporary, debugging code */
+		if (!(XFS_BMAP_SANITY_CHECK(mp, block, 0))) {
+			cmn_err(CE_NOTE,
+			"xfs_bmap_read_extents: BMAP Sanity Check:");
+			if (!(INT_GET(block->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC))
+				cmn_err(CE_NOTE,
+					"bb_magic 0x%x",
+					INT_GET(block->bb_magic, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_level, ARCH_CONVERT) == level))
+				cmn_err(CE_NOTE,
+					"bb_level %d",
+					INT_GET(block->bb_level, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_numrecs, ARCH_CONVERT) > 0))
+				cmn_err(CE_NOTE,
+					"bb_numrecs %d",
+					INT_GET(block->bb_numrecs, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0]))
+				cmn_err(CE_NOTE,
+					"bb_numrecs %d < m_bmap_dmxr[] %d",
+					INT_GET(block->bb_numrecs, ARCH_CONVERT),
+					(mp)->m_bmap_dmxr[(level) != 0]);
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto error0;
+		}
+#endif
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+		if (nextbno != NULLFSBLOCK)
+			xfs_btree_reada_bufl(mp, nextbno, 1);
+		/*
+		 * Copy records into the extent list.
+		 */
+		frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
+			block, 1, mp->m_bmap_dmxr[0]);
+		bcopy(frp, trp, num_recs * sizeof(*frp));
+		if (exntf == XFS_EXTFMT_NOSTATE) {
+			/*
+			 * Check all attribute bmap btree records and
+			 * any "older" data bmap btree records for a
+			 * set bit in the "extent flag" position.
+			 */
+			if (xfs_check_nostate_extents(trp, num_recs)) {
+				goto error0;
+			}
+		}
+		trp += num_recs;
+		i += num_recs;
+		xfs_trans_brelse(tp, bp);
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			return error;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+	}
+	ASSERT(i == ifp->if_bytes / (uint)sizeof(*trp));
+	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_bmap_trace_exlist(fname, ip, i, whichfork);
+	return 0;
+error0:
+	/*
+	 * Every goto error0 above is taken while bp is still held, so
+	 * release it here.  The value left in "error" is not used; this
+	 * path always reports EFSCORRUPTED.
+	 */
+	xfs_trans_brelse(tp, bp);
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+#ifdef XFS_BMAP_TRACE
+/*
+ * Add bmap trace insert entries for all the contents of the extent list.
+ * One xfs_bmap_trace_insert() call is made per extent, tagged "exlist".
+ */
+void
+xfs_bmap_trace_exlist(
+	char		*fname,		/* function name */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	cnt,		/* count of entries in the list */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;		/* base of extent list */
+	xfs_bmbt_rec_t	*ep;		/* current entry in extent list */
+	xfs_extnum_t	idx;		/* extent list entry number */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_irec_t	s;		/* extent list record */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/* cnt must describe the whole in-core extent array. */
+	ASSERT(cnt == ifp->if_bytes / (uint)sizeof(*base));
+	base = ifp->if_u1.if_extents;
+	for (idx = 0, ep = base; idx < cnt; idx++, ep++) {
+		xfs_bmbt_get_all(ep, &s);
+		xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL,
+			whichfork);
+	}
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the callers original parameters. Specifically check the
+ * ranges of the returned irecs to ensure that they only extent beyond
+ */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} +#endif /* DEBUG */ + + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_extlen_t total, /* total blocks needed */ + xfs_bmbt_irec_t *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist) /* i/o: list extents to free */ +{ + xfs_fsblock_t abno; /* allocated block number */ + xfs_extlen_t alen; /* allocated extent length */ + xfs_fileoff_t aoff; /* allocated file offset */ + xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ + char contig; /* allocation must be one extent */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + char delay; /* this request is for delayed alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* we've hit the end of extent list */ + xfs_bmbt_rec_t *ep; /* extent list entry pointer */ + int error; /* error return */ + char exact; /* don't do all of wasdelayed extent */ + xfs_bmbt_irec_t got; /* current extent list record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extlen_t indlen; /* indirect blocks length */ + char inhole; /* current location is hole in file */ + xfs_extnum_t lastx; /* last useful extent number */ + int logflags; /* flags for transaction logging */ + xfs_extlen_t minleft; /* min blocks left after allocation */ + xfs_extlen_t minlen; /* min allocation size */ + xfs_mount_t *mp; /* xfs mount structure */ + int n; /* current extent index */ + int nallocs; /* number of extents alloc\'d */ + xfs_extnum_t nextents; /* number of extents in file */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_bmbt_irec_t prev; /* previous extent list record */ + char stateless; /* ignore state flag set */ + int tmp_logflags; /* temp flags holder */ + char trim; /* output trimmed to match range */ + char userdata; /* allocating non-metadata */ + char wasdelay; /* old extent was delayed */ + int whichfork; /* data or attr fork */ + char wr; /* this is a write request */ + char rsvd; /* OK to allocate reserved blocks */ +#ifdef DEBUG + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* 
original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + xfs_bmbt_irec_t *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ + + orig_bno = bno; + orig_len = len; + orig_flags = flags; + orig_mval = mval; + orig_nmap = *nmap; +#endif + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) { +#ifdef __KERNEL__ /* additional, temporary, debugging code */ + cmn_err(CE_NOTE, + "EFSCORRUPTED returned from file %s line %d", + __FILE__, __LINE__); +#endif + return XFS_ERROR(EFSCORRUPTED); + } + mp = ip->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) + XFS_STATS_INC(xfsstats.xs_blk_mapw); + else + XFS_STATS_INC(xfsstats.xs_blk_mapr); + delay = (flags & XFS_BMAPI_DELAY) != 0; + trim = (flags & XFS_BMAPI_ENTIRE) == 0; + userdata = (flags & XFS_BMAPI_METADATA) == 0; + exact = (flags & XFS_BMAPI_EXACT) != 0; + rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; + contig = (flags & XFS_BMAPI_CONTIG) != 0; + /* + * stateless is used to combine extents which + * differ only due to the state of the extents. + * This technique is used from xfs_getbmap() + * when the caller does not wish to see the + * separation (which is the default). + * + * This technique is also used when writing a + * buffer which has been partially written, + * (usually by being flushed during a chunkread), + * to ensure one write takes place. This also + * prevents a change in the xfs inode extents at + * this time, intentionally. 
This change occurs + * on completion of the write operation, in + * xfs_strat_comp(), where the xfs_bmapi() call + * is transactioned, and the extents combined. + */ + stateless = (flags & XFS_BMAPI_IGSTATE) != 0; + if (stateless && wr) /* if writing unwritten space, no */ + wr = 0; /* allocations are allowed */ + ASSERT(wr || !delay); + logflags = 0; + nallocs = 0; + cur = NULL; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + ASSERT(wr && tp); + if ((error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, + &logflags, whichfork))) + goto error0; + } + if (wr && *firstblock == NULLFSBLOCK) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) + minleft = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1; + else + minleft = 1; + } else + minleft = 0; + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + goto error0; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + n = 0; + end = bno + len; + obno = bno; + bma.ip = NULL; + while (bno < end && n < *nmap) { + /* + * Reading past eof, act as though there's a hole + * up to end. + */ + if (eof && !wr) + got.br_startoff = end; + inhole = eof || got.br_startoff > bno; + wasdelay = wr && !inhole && !delay && + ISNULLSTARTBLOCK(got.br_startblock); + /* + * First, deal with the hole before the allocated space + * that we found, if any. + */ + if (wr && (inhole || wasdelay)) { + /* + * For the wasdelay case, we could also just + * allocate the stuff asked for in this bmap call + * but that wouldn't be as good. 
+ */ + if (wasdelay && !exact) { + alen = (xfs_extlen_t)got.br_blockcount; + aoff = got.br_startoff; + if (lastx != NULLEXTNUM && lastx) { + ep = &ifp->if_u1.if_extents[lastx - 1]; + xfs_bmbt_get_all(ep, &prev); + } + } else if (wasdelay) { + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(len, + (got.br_startoff + + got.br_blockcount) - bno); + aoff = bno; + } else { + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(alen, + got.br_startoff - bno); + aoff = bno; + } + minlen = contig ? alen : 1; + if (delay) { + indlen = (xfs_extlen_t) + xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + /* + * Make a transaction-less quota reservation for + * delayed allocation blocks. This number gets + * adjusted later. + * We return EDQUOT if we haven't allocated + * blks already inside this loop; + */ + if (XFS_IS_QUOTA_ON(ip->i_mount) && + xfs_trans_reserve_blkquota(NULL, ip, + (long)alen)) { + if (n == 0) { + *nmap = 0; + ASSERT(cur == NULL); + return XFS_ERROR(EDQUOT); + } + break; + } + if (xfs_mod_incore_sb(ip->i_mount, + XFS_SBS_FDBLOCKS, + -(alen + indlen), rsvd)) { + if (XFS_IS_QUOTA_ON(ip->i_mount)) + xfs_trans_unreserve_blkquota( + NULL, ip, (long)alen); + break; + } + ip->i_delayed_blks += alen; + abno = NULLSTARTBLOCK(indlen); + } else { + /* + * If first time, allocate and fill in + * once-only bma fields. + */ + if (bma.ip == NULL) { + bma.tp = tp; + bma.ip = ip; + bma.prevp = &prev; + bma.gotp = &got; + bma.total = total; + bma.userdata = 0; + } + /* Indicate if this is the first user data + * in the file, or just any user data. + */ + if (userdata) { + bma.userdata = (aoff == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : + XFS_ALLOC_USERDATA; + } + /* + * Fill in changeable bma fields. 
+ */ + bma.eof = eof; + bma.firstblock = *firstblock; + bma.alen = alen; + bma.off = aoff; + bma.wasdel = wasdelay; + bma.minlen = minlen; + bma.low = flist->xbf_low; + bma.minleft = minleft; + /* + * Only want to do the alignment at the + * eof if it is userdata and allocation length + * is larger than a stripe unit. + */ + if (mp->m_dalign && alen >= mp->m_dalign && + userdata && whichfork == XFS_DATA_FORK) { + if ((error = xfs_bmap_isaeof(ip, aoff, + whichfork, &bma.aeof))) + goto error0; + } else + bma.aeof = 0; + /* + * Call allocator. + */ + if ((error = xfs_bmap_alloc(&bma))) + goto error0; + /* + * Copy out result fields. + */ + abno = bma.rval; + if ((flist->xbf_low = bma.low)) + minleft = 0; + alen = bma.alen; + aoff = bma.off; + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(ip->i_mount, + *firstblock) == + XFS_FSB_TO_AGNO(ip->i_mount, + bma.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(ip->i_mount, + *firstblock) < + XFS_FSB_TO_AGNO(ip->i_mount, + bma.firstblock))); + *firstblock = bma.firstblock; + if (cur) + cur->bc_private.b.firstblock = + *firstblock; + if (abno == NULLFSBLOCK) + break; + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_btree_init_cursor(ip->i_mount, + tp, NULL, 0, XFS_BTNUM_BMAP, + ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + nallocs++; + } + if (cur) + cur->bc_private.b.flags = + wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; + got.br_startoff = aoff; + got.br_startblock = abno; + got.br_blockcount = alen; + got.br_state = XFS_EXT_NORM; /* assume normal */ + /* + * Determine state of extent, and the filesystem. + * A wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. 
+ */ + if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) + got.br_state = XFS_EXT_UNWRITTEN; + } + error = xfs_bmap_add_extent(ip, lastx, &cur, &got, + firstblock, flist, &tmp_logflags, whichfork, + rsvd); + logflags |= tmp_logflags; + if (error) + goto error0; + lastx = ifp->if_lastex; + ep = &ifp->if_u1.if_extents[lastx]; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= aoff); + ASSERT(got.br_startoff + got.br_blockcount >= + aoff + alen); +#ifdef DEBUG + if (delay) { + ASSERT(ISNULLSTARTBLOCK(got.br_startblock)); + ASSERT(STARTBLOCKVAL(got.br_startblock) > 0); + } + ASSERT(got.br_state == XFS_EXT_NORM || + got.br_state == XFS_EXT_UNWRITTEN); +#endif + /* + * Fall down into the found allocated space case. + */ + } else if (inhole) { + /* + * Reading in a hole. + */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + /* + * Then deal with the allocated space we found. + */ + ASSERT(ep != NULL); + if (trim && (got.br_startoff + got.br_blockcount > obno)) { + if (obno > bno) + bno = obno; + ASSERT((bno >= obno) || (n == 0)); + ASSERT(bno < end); + mval->br_startoff = bno; + if (ISNULLSTARTBLOCK(got.br_startblock)) { + ASSERT(!wr || delay); + mval->br_startblock = DELAYSTARTBLOCK; + } else + mval->br_startblock = + got.br_startblock + + (bno - got.br_startoff); + /* + * Return the minimum of what we got and what we + * asked for for the length. We can use the len + * variable here because it is modified below + * and we could have been there before coming + * here if the first part of the allocation + * didn't overlap what was asked for. 
+ */ + mval->br_blockcount = + XFS_FILBLKS_MIN(end - bno, got.br_blockcount - + (bno - got.br_startoff)); + mval->br_state = got.br_state; + ASSERT(mval->br_blockcount <= len); + } else { + *mval = got; + if (ISNULLSTARTBLOCK(mval->br_startblock)) { + ASSERT(!wr || delay); + mval->br_startblock = DELAYSTARTBLOCK; + } + } + + /* + * Check if writing previously allocated but + * unwritten extents. + */ + if (wr && mval->br_state == XFS_EXT_UNWRITTEN && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_btree_init_cursor(ip->i_mount, + tp, NULL, 0, XFS_BTNUM_BMAP, + ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + mval->br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent(ip, lastx, &cur, mval, + firstblock, flist, &tmp_logflags, whichfork, + rsvd); + logflags |= tmp_logflags; + if (error) + goto error0; + lastx = ifp->if_lastex; + ep = &ifp->if_u1.if_extents[lastx]; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + /* + * We may have combined previously unwritten + * space with written space, so generate + * another request. 
+ */ + if (mval->br_blockcount < len) + continue; + } + + ASSERT(!trim || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT(!trim || (mval->br_blockcount <= len) || + (mval->br_startoff < obno)); + bno = mval->br_startoff + mval->br_blockcount; + len = end - bno; + if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == + mval[-1].br_startblock + mval[-1].br_blockcount && + (stateless || mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + n++; + } + /* + * If we're done, stop now. Stop when we've allocated + * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise + * the transaction may get too big. + */ + if (bno >= end || n >= *nmap || nallocs >= *nmap) + break; + /* + * Else go on to the next record. + */ + ep++; + lastx++; + if (lastx >= nextents) { + eof = 1; + prev = got; + } else + xfs_bmbt_get_all(ep, &got); + } + ifp->if_lastex = lastx; + *nmap = n; + /* + * Transform from btree to extents, give it cur. 
+ */ + if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(wr && cur); + error = xfs_bmap_btree_to_extents(tp, ip, cur, + &tmp_logflags, whichfork, 0); + logflags |= tmp_logflags; + if (error) + goto error0; + } + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + error = 0; + +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent list if we've converted to btree format. + */ + if ((logflags & XFS_ILOG_FEXT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~XFS_ILOG_FEXT(whichfork); + else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~XFS_ILOG_FBROOT(whichfork); + /* + * Log whatever the flags say, even if error. Otherwise we might miss + * detecting a case where the data is changed, there's an error, + * and it's not logged so we don't shutdown when we should. + */ + if (logflags) { + ASSERT(tp && wr); + xfs_trans_log_inode(tp, ip, logflags); + } + if (cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(ip->i_mount, *firstblock) == + XFS_FSB_TO_AGNO(ip->i_mount, + cur->bc_private.b.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(ip->i_mount, *firstblock) < + XFS_FSB_TO_AGNO(ip->i_mount, + cur->bc_private.b.firstblock))); + *firstblock = cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + if (!error) + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return error; +} + +/* + * Map file blocks to filesystem blocks, simple version. + * One block (extent) only, read-only. 
+ * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* 0 or error code */ +xfs_bmapi_single( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* out: mapped block or NULLFSBLOCK */ + xfs_fileoff_t bno) /* file offset to map */ +{ + int eof; /* we've hit the end of extent list */ + int error; /* error return */ + xfs_bmbt_irec_t got; /* current extent list record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last useful extent number */ + xfs_bmbt_irec_t prev; /* previous extent list record */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) { +#ifdef __KERNEL__ /* additional, temporary, debugging code */ + cmn_err(CE_NOTE, + "EFSCORRUPTED returned from file %s line %d", + __FILE__, __LINE__); +#endif + return XFS_ERROR(EFSCORRUPTED); + } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return XFS_ERROR(EIO); + XFS_STATS_INC(xfsstats.xs_blk_mapr); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + /* + * Past eof, or bno lies in a hole before the extent found: + * report NULLFSBLOCK and return success. + */ + if (eof || got.br_startoff > bno) { + *fsb = NULLFSBLOCK; + return 0; + } + ASSERT(!ISNULLSTARTBLOCK(got.br_startblock)); + ASSERT(bno < got.br_startoff + got.br_blockcount); + *fsb = got.br_startblock + (bno - got.br_startoff); + ifp->if_lastex = lastx; + return 0; +} + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. 
+ */ +int /* error */ +xfs_bunmapi( + xfs_trans_t *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* misc flags */ + xfs_extnum_t nexts, /* max extents to remove, 0 = no limit */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done) /* out: nonzero once range is done */ +{ + int async; /* xactions can be async */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_irec_t del; /* extent being deleted */ + int eof; /* is deleting at eof */ + xfs_bmbt_rec_t *ep; /* extent list entry pointer */ + int error; /* error return value */ + xfs_extnum_t extno; /* extent number in list */ + xfs_bmbt_irec_t got; /* current extent list entry */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int isrt; /* freeing in rt area */ + xfs_extnum_t lastx; /* last extent index used */ + int logflags; /* transaction logging flags */ + xfs_extlen_t mod; /* rt extent offset */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* size of extent list */ + xfs_bmbt_irec_t prev; /* previous extent list entry */ + xfs_fileoff_t start; /* first file offset deleted */ + int tmp_logflags; /* partial logging flags */ + int wasdel; /* was a delayed alloc extent */ + int whichfork; /* data or attribute fork */ + int rsvd; /* OK to allocate reserved blocks */ + xfs_fsblock_t sum; /* block just past the end of del */ + + xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 
+ XFS_ATTR_FORK : XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { +#ifdef __KERNEL__ /* additional, temporary, debugging code */ + cmn_err(CE_NOTE, + "EFSCORRUPTED returned from file %s line %d", + __FILE__, __LINE__); +#endif + return XFS_ERROR(EFSCORRUPTED); + } + mp = ip->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + async = flags & XFS_BMAPI_ASYNC; + rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; + ASSERT(len > 0); + ASSERT(nexts >= 0); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *done = 1; + return 0; + } + XFS_STATS_INC(xfsstats.xs_blk_unmap); + isrt = (whichfork == XFS_DATA_FORK) && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); + start = bno; + bno = start + len - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + /* + * Check to see if the given block number is past the end of the + * file, back up to the last block if so... + */ + if (eof) { + ep = &ifp->if_u1.if_extents[--lastx]; + xfs_bmbt_get_all(ep, &got); + bno = got.br_startoff + got.br_blockcount - 1; + } + logflags = 0; + if (ifp->if_flags & XFS_IFBROOT) { + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, + whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else + cur = NULL; + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + (nexts == 0 || extno < nexts)) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. 
+ */ + if (got.br_startoff > bno) { + if (--lastx < 0) + break; + ep--; + xfs_bmbt_get_all(ep, &got); + } + /* + * Is the last block of this extent before the range + * we're supposed to delete? If so, we're done. + */ + bno = XFS_FILEOFF_MIN(bno, + got.br_startoff + got.br_blockcount - 1); + if (bno < start) + break; + /* + * Then deal with the (possibly delayed) allocated space + * we found. + */ + ASSERT(ep != NULL); + del = got; + wasdel = ISNULLSTARTBLOCK(del.br_startblock); + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; + if (!wasdel) + del.br_startblock += start - got.br_startoff; + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent not lined up at the end. + * The extent could have been split into written + * and unwritten pieces, or we could just be + * unmapping part of it. But we can't really + * get rid of part of a realtime extent. + */ + if (del.br_state == XFS_EXT_UNWRITTEN || + !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + /* + * This piece is unwritten, or we're not + * using unwritten extents. Skip over it. + */ + ASSERT(bno >= mod); + bno -= mod > del.br_blockcount ? + del.br_blockcount : mod; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(--ep, &got); + } + continue; + } + /* + * It's written, turn it unwritten. + * This is better than zeroing it. + */ + ASSERT(del.br_state == XFS_EXT_NORM); + ASSERT(xfs_trans_get_block_res(tp) > 0); + /* + * If this spans a realtime extent boundary, + * chop it back to the start of the one we end at. 
+ */ + if (del.br_blockcount > mod) { + del.br_startoff += del.br_blockcount - mod; + del.br_startblock += del.br_blockcount - mod; + del.br_blockcount = mod; + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, lastx, &cur, &del, + firstblock, flist, &logflags, XFS_DATA_FORK, 0); + if (error) + goto error0; + goto nodelete; + } + if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent is lined up at the end but not + * at the front. We'll get rid of full extents if + * we can. + */ + mod = mp->m_sb.sb_rextsize - mod; + if (del.br_blockcount > mod) { + del.br_blockcount -= mod; + del.br_startoff += mod; + del.br_startblock += mod; + } else if ((del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + xfs_trans_get_block_res(tp) == 0)) || + !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + /* + * Can't make it unwritten. There isn't + * a full extent here so just skip it. + */ + ASSERT(bno >= del.br_blockcount); + bno -= del.br_blockcount; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(--ep, &got); + } + continue; + } else if (del.br_state == XFS_EXT_UNWRITTEN) { + /* + * This one is already unwritten. + * It must have a written left neighbor. + * Unwrite the killed part of that one and + * try again. 
+ */ + ASSERT(lastx > 0); + xfs_bmbt_get_all(ep - 1, &prev); + ASSERT(prev.br_state == XFS_EXT_NORM); + ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock)); + ASSERT(del.br_startblock == + prev.br_startblock + prev.br_blockcount); + if (prev.br_startoff < start) { + mod = start - prev.br_startoff; + prev.br_blockcount -= mod; + prev.br_startblock += mod; + prev.br_startoff = start; + } + prev.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, lastx - 1, &cur, + &prev, firstblock, flist, &logflags, + XFS_DATA_FORK, 0); + if (error) + goto error0; + goto nodelete; + } else { + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, lastx, &cur, + &del, firstblock, flist, &logflags, + XFS_DATA_FORK, 0); + if (error) + goto error0; + goto nodelete; + } + } + if (wasdel) { + ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, + (int)del.br_blockcount, rsvd); + if (XFS_IS_QUOTA_ON(ip->i_mount)) { + ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); + ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + if (!isrt) + xfs_trans_unreserve_blkquota(NULL, ip, + (long)del.br_blockcount); + else + xfs_trans_unreserve_rtblkquota(NULL, ip, + (long)del.br_blockcount); + } + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + /* + * If it's the case where the directory code is running + * with no block reservation, and the deleted block is in + * the middle of its extent, and the resulting insert + * of an extent would cause transformation to btree format, + * then reject it. The calling code will then swap + * blocks around instead. + * We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction + * will be dirty. 
+ */ + if (!wasdel && xfs_trans_get_block_res(tp) == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + del.br_startoff > got.br_startoff && + del.br_startoff + del.br_blockcount < + got.br_startoff + got.br_blockcount) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, + flags, &tmp_logflags, whichfork, rsvd); + logflags |= tmp_logflags; + if (error) + goto error0; + bno = del.br_startoff - 1; +nodelete: + lastx = ifp->if_lastex; + /* + * If not done go on to the next (previous) record. + * Reset ep in case the extents array was re-alloced. + */ + ep = &ifp->if_u1.if_extents[lastx]; + if (bno != (xfs_fileoff_t)-1 && bno >= start) { + if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || + xfs_bmbt_get_startoff(ep) > bno) { + lastx--; + ep--; + } + if (lastx >= 0) + xfs_bmbt_get_all(ep, &got); + extno++; + } + } + ifp->if_lastex = lastx; + *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; /* zero only if the nexts limit ended the loop */ + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Convert to a btree if necessary. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from btree to extents, give it cur + */ + else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(cur != NULL); + error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + whichfork, async); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from extents to local? 
+ */ + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent list if we've converted to btree format. + */ + if ((logflags & XFS_ILOG_FEXT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~XFS_ILOG_FEXT(whichfork); + else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~XFS_ILOG_FBROOT(whichfork); + /* + * Log inode even in the error case, if the transaction + * is dirty we'll need to shut down the filesystem. + */ + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + if (!error) { + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Fcntl interface to xfs_bmapi. + */ +int /* error code */ +xfs_getbmap( + bhv_desc_t *bdp, /* XFS behavior descriptor*/ + struct getbmap *bmv, /* user bmap structure */ + void *ap, /* pointer to user's array */ + int interface) /* interface flags */ +{ + __int64_t bmvend; /* last block requested */ + int error; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + xfs_inode_t *ip; /* xfs incore inode pointer */ + vnode_t *vp; /* corresponding vnode */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmap out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int sh_unwritten; /* true, if unwritten */ + /* extents listed separately */ + 
int bmapi_flags; /* flags for xfs_bmapi */ + __int32_t oflags; /* getbmapx bmv_oflags field */ + + ip = XFS_BHVTOI(bdp); + vp = BHV_TO_VNODE(bdp); + + whichfork = interface & BMV_IF_ATTRFORK ? + XFS_ATTR_FORK : XFS_DATA_FORK; + sh_unwritten = (interface & BMV_IF_PREALLOC) != 0; + + + /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not + * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ + * bit is set for the file, generate a read event in order + * that the DMAPI application may do its thing before we return + * the extents. Usually this means restoring user file data to + * regions of the file that look like holes. + * + * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify + * BMV_IF_NO_DMAPI_READ so that read events are generated. + * If this were not true, callers of ioctl( XFS_IOC_GETBMAP ) + * could misinterpret holes in a DMAPI file as true holes, + * when in fact they may represent offline user data. + */ + if ( (interface & BMV_IF_NO_DMAPI_READ) == 0 + && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) + && whichfork == XFS_DATA_FORK) { + + error = xfs_dm_send_data_event(DM_EVENT_READ, bdp, + 0, 0, 0, NULL); + if (error) + return XFS_ERROR(error); + } + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + } else if (ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + cmn_err(CE_NOTE, + "EFSCORRUPTED returned from file %s line %d", + __FILE__, __LINE__); + return XFS_ERROR(EFSCORRUPTED); + } + } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + + mp = ip->i_mount; + + if (whichfork == XFS_DATA_FORK) { + if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) { + prealloced = 1; + fixlen = 
XFS_MAX_FILE_OFFSET; + } else { + prealloced = 0; + fixlen = ip->i_d.di_size; + } + } else { + prealloced = 0; + fixlen = 1LL << 32; + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset), + (__int64_t)0); + } else if (bmv->bmv_length < 0) + return XFS_ERROR(EINVAL); + if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } + + nex = bmv->bmv_count - 1; + + if (nex <= 0) + return XFS_ERROR(EINVAL); + + bmvend = bmv->bmv_offset + bmv->bmv_length; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) { + + VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error); + } + + ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); + + lock = xfs_ilock_map_shared(ip); + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. + */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | + ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); + + subnex = 16; /* XXXjtk - need a #define? */ + + /* + * Allocate enough space to handle "subnex" maps at a time. 
+ */ + map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP); + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { + error = 0; + goto unlock_and_return; + } + + nexleft = nex; + + do { + if (nexleft > subnex) + nmap = subnex; + else + nmap = nexleft; + + error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + bmapi_flags, NULL, 0, + map, &nmap, NULL); + ASSERT(nmap <= subnex); + + if (error) + goto unlock_and_return; + + for (error = i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + nexleft--; + + oflags = 0; + + out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); + out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + + if ( prealloced + && map[i].br_startblock == HOLESTARTBLOCK + && out.bmv_offset + out.bmv_length == bmvend) { + /* + * came to hole at end of file + */ + goto unlock_and_return; + } else { + if (map[i].br_startblock == HOLESTARTBLOCK) + out.bmv_block = -1; + else + out.bmv_block = + XFS_FSB_TO_DB(ip, map[i].br_startblock); + + /* return either a getbmap or a getbmapx structure. 
*/ + + if (interface & BMV_IF_EXTENDED) { + struct getbmapx outx; + + GETBMAP_CONVERT(out,outx); + + outx.bmv_oflags = oflags; + outx.bmv_unused1 = outx.bmv_unused2 = 0; + + if (copy_to_user(ap, &outx, sizeof(outx))) { + error = XFS_ERROR(EFAULT); + goto unlock_and_return; + } + } else { + if (copy_to_user(ap, &out, sizeof(out))) { + error = XFS_ERROR(EFAULT); + goto unlock_and_return; + } + } + + bmv->bmv_offset = out.bmv_offset + out.bmv_length; + bmv->bmv_length = MAX( (__int64_t)0, + (__int64_t)(bmvend - bmv->bmv_offset) ); + + bmv->bmv_entries++; + + if (interface & BMV_IF_EXTENDED) + ap = (void *)((struct getbmapx *)ap + 1); + else + ap = (void *)((struct getbmap *)ap + 1); + } + } + } while (nmap && nexleft && bmv->bmv_length); + +unlock_and_return: + xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + kmem_free(map, subnex * sizeof(*map)); + + return error; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. 
+ */ +int /* error */ +xfs_bmap_isaeof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t off, /* file offset in fsblocks */ + int whichfork, /* fork; must be XFS_DATA_FORK */ + char *aeof) /* out: nonzero if off is at eof */ +{ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */ + xfs_extnum_t nextents; /* size of extent list */ + xfs_bmbt_irec_t s; /* expanded extent list entry */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *aeof = 1; + return 0; + } + /* + * Expand the last extent record into s. + */ + lastrec = &ifp->if_u1.if_extents[nextents - 1]; + xfs_bmbt_get_all(lastrec, &s); + /* + * Check we are allocating in the last extent (for delayed allocations) + * or past the last extent for non-delayed allocations. + */ + *aeof = (off >= s.br_startoff && + off < s.br_startoff + s.br_blockcount && + ISNULLSTARTBLOCK(s.br_startblock)) || + off >= s.br_startoff + s.br_blockcount; + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. 
+ */ +int /* error */ +xfs_bmap_eof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t endoff, /* file offset in fsblocks */ + int whichfork, /* fork; must be XFS_DATA_FORK */ + int *eof) /* out: 1 if endoff is at/after end of last extent */ +{ + xfs_fsblock_t blockcount; /* extent block count */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */ + xfs_extnum_t nextents; /* size of extent list */ + xfs_fileoff_t startoff; /* extent starting file offset */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *eof = 1; + return 0; + } + /* + * Read start offset and length of the last extent. + */ + lastrec = &ifp->if_u1.if_extents[nextents - 1]; + startoff = xfs_bmbt_get_startoff(lastrec); + blockcount = xfs_bmbt_get_blockcount(lastrec); + *eof = endoff >= startoff + blockcount; + return 0; +} + +#ifdef XFSDEBUG +/* + * Check that the extents list for the inode ip is in the right order. 
+ */ +STATIC void +xfs_bmap_check_extents( + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* base of extents list */ + xfs_bmbt_rec_t *ep; /* current extent entry */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extents in list */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + base = ifp->if_u1.if_extents; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (ep = base; ep < &base[nextents - 1]; ep++) { + xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, + (void *)(ep + 1)); + } +} + +STATIC +xfs_buf_t * +xfs_bmap_get_bp( + xfs_btree_cur_t *cur, + xfs_fsblock_t bno) +{ + int i; + xfs_buf_t *bp; + + if (!cur) + return(NULL); + + bp = NULL; + for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + bp = cur->bc_bufs[i]; + if (!bp) break; + if (XFS_BUF_ADDR(bp) == bno) + break; /* Found it among the cursor's buffers */ + } + if (i == XFS_BTREE_MAXLEVELS) + bp = NULL; + + if (!bp) { /* Chase down all the log items to see if the bp is there */ + xfs_log_item_chunk_t *licp; + xfs_trans_t *tp; + + tp = cur->bc_tp; + licp = &tp->t_items; + while (!bp && licp != NULL) { + if (XFS_LIC_ARE_ALL_FREE(licp)) { + licp = licp->lic_next; + continue; + } + for (i = 0; i < licp->lic_unused; i++) { + xfs_log_item_desc_t *lidp; + xfs_log_item_t *lip; + xfs_buf_log_item_t *bip; + xfs_buf_t *lbp; + + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + lidp = XFS_LIC_SLOT(licp, i); + lip = lidp->lid_item; + if (lip->li_type != XFS_LI_BUF) + continue; + + bip = (xfs_buf_log_item_t *)lip; + lbp = bip->bli_buf; + + if (XFS_BUF_ADDR(lbp) == bno) { + bp = lbp; + break; /* Found it among the transaction's buf log items */ + } + } + licp = licp->lic_next; + } + } + return(bp); +} + +void +xfs_check_block( + xfs_bmbt_block_t *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + xfs_bmbt_ptr_t *pp, *thispa; /* the two child block pointers being compared */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(INT_GET(block->bb_level, 
ARCH_CONVERT) > 0); + + prevp = NULL; + for( i = 1; i <= INT_GET(block->bb_numrecs, ARCH_CONVERT);i++) { + dmxr = mp->m_bmap_dmxr[0]; + + if (root) { + keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz); + } else { + keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, i, dmxr); + } + + if (prevp) { + xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + + if (root) { + pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz); + } else { + pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, i, dmxr); + } + for (j = i+1; j <= INT_GET(block->bb_numrecs, ARCH_CONVERT); j++) { + if (root) { + thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); + } else { + thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, j, dmxr); + } + if (INT_GET(*thispa, ARCH_CONVERT) == INT_GET(*pp, ARCH_CONVERT)) { + printk("xfs_check_block: thispa(%d) == pp(%d) %Ld\n", + j, i, INT_GET(*thispa, ARCH_CONVERT)); + panic("xfs_check_block: ptrs are equal in node\n"); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. 
+ */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_block_t *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0; /* running count of leaf records seen */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_bmbt_ptr_t *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep, *lastp; /* extent pointers in block entry */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0); + level = INT_GET(block->bb_level, ARCH_CONVERT); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); + ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); + bno = INT_GET(*pp, ARCH_CONVERT); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. 
+ */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + XFS_BMAP_SANITY_CHECK(mp, block, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, + 1, mp->m_bmap_dmxr[1]); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0); + bno = INT_GET(*pp, ARCH_CONVERT); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + lastp = NULL; + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + + /* + * Note the right sibling now; bp may be released below. + */ + + nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. 
+ */ + + frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, + block, 1, mp->m_bmap_dmxr[0]); + + for (ep = frp;ep < frp + (num_recs - 1); ep++) { + if (lastp) { + xfs_btree_check_rec(XFS_BTNUM_BMAP, + (void *)lastp, (void *)ep); + } + xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, + (void *)(ep + 1)); + } + lastp = frp + num_recs - 1; /* For the next iteration */ + + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + printk("at error0\n"); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + printk("xfs_bmap_check_leaf_extents: BAD after btree leaves for %d extents\n", i); + panic("xfs_bmap_check_leaf_extents: CORRUPTED BTREE OR SOMETHING"); + return; +} +#endif + +/* + * Count fsblocks of the given fork. 
+ */
+int						/* error */
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	int			*count)		/* out: count of blocks */
+{
+	xfs_bmbt_block_t	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+
+	/*
+	 * Everything found is added to *count; it is never reset here.
+	 * NOTE(review): presumably the caller zeroes it first -- confirm.
+	 *
+	 * The helpers report failure with a non-zero (positive) error
+	 * number, as the XFS_ERROR(EFSCORRUPTED) returns below do, so
+	 * failure has to be tested as "non-zero"; a "< 0" test never fires
+	 * and would let a partial count be returned as success.
+	 */
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+		/* Extents format: the incore extent list is all there is. */
+		if (xfs_bmap_count_leaves(ifp->if_u1.if_extents,
+			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+			count) != 0) {
+			cmn_err(CE_NOTE,
+				"EFSCORRUPTED returned from file %s line %d",
+				__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		return 0;
+	}
+
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	block = ifp->if_broot;
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+
+	/* Count the on-disk btree blocks and what their leaf records map. */
+	if (xfs_bmap_count_tree(mp, tp, bno, level, count) != 0) {
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	return 0;
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ */
+int						/* error */
+xfs_bmap_count_tree(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	blockno,	/* file system block number */
+	int		levelin,	/* level in btree */
+	int		*count)		/* Count of blocks */
+{
+	int			error;
+	xfs_buf_t		*bp, *nbp;
+	int			level = levelin;
+	xfs_bmbt_ptr_t		*pp;
+	xfs_fsblock_t		bno = blockno;
+	xfs_fsblock_t		nextbno;
+	xfs_bmbt_block_t	*block, *nextblock;
+	int			numrecs;
+	xfs_bmbt_rec_t		*frp;
+
+	/*
+	 * "blockno" is treated as sitting one level below "levelin".
+	 * Every block read adds 1 to *count; at the leaf level the block
+	 * counts of the records are added as well.  Returns 0 or a
+	 * non-zero (positive) error number.
+	 */
+	if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+		return error;
+	*count += 1;
+	block = XFS_BUF_TO_BMBT_BLOCK(bp);
+
+	if (--level) {
+		/* Interior node: count the right siblings on this level */
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+		while (nextbno != NULLFSBLOCK) {
+			if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
+				0, &nbp, XFS_BMAP_BTREE_REF))) {
+				/* bp is still held; give it back first */
+				xfs_trans_brelse(tp, bp);
+				return error;
+			}
+			*count += 1;
+			nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
+			nextbno = INT_GET(nextblock->bb_rightsib, ARCH_CONVERT);
+			xfs_trans_brelse(tp, nbp);
+		}
+
+		/* Dive to the next level */
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
+			xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		/*
+		 * The recursive call fails with a positive value, so test
+		 * for non-zero; "< 0" would silently drop every error.
+		 */
+		if ((error =
+		     xfs_bmap_count_tree(mp, tp, bno, level, count)) != 0) {
+			xfs_trans_brelse(tp, bp);
+			cmn_err(CE_NOTE,
+				"EFSCORRUPTED returned from file %s line %d",
+				__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_trans_brelse(tp, bp);
+	} else {
+		/* Leaf level: count each leaf and the blocks its records map */
+		for (;;) {
+			/* read the sibling link before the buffer is released */
+			nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+			numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+			frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize,
+				xfs_bmbt, block, 1, mp->m_bmap_dmxr[0]);
+			if (xfs_bmap_count_leaves(frp, numrecs, count) != 0) {
+				xfs_trans_brelse(tp, bp);
+				cmn_err(CE_NOTE,
+					"EFSCORRUPTED returned from file %s line %d",
+					__FILE__, __LINE__);
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+			xfs_trans_brelse(tp, bp);
+			if (nextbno == NULLFSBLOCK)
+				break;
+			bno = nextbno;
+			if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+				return error;
+			*count += 1;
+			block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Count leaf blocks given a pointer to an extent list.
+ * Adds the block count of each of the numrecs records starting at frp
+ * to *count.  Always returns 0.
+ */
+int
+xfs_bmap_count_leaves(
+	xfs_bmbt_rec_t		*frp,
+	int			numrecs,
+	int			*count)
+{
+	int	b;
+
+	for ( b = 1; b <= numrecs; b++, frp++)
+		*count += xfs_bmbt_get_blockcount(frp);
+	return 0;
+}
+
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_bmap.h linux-2.4-xfs/fs/xfs/xfs_bmap.h
--- linux-2.4.19/fs/xfs/xfs_bmap.h	Thu Jan  1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_bmap.h	Wed Jul 10 23:13:51 2002
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BMAP_H__ +#define __XFS_BMAP_H__ + +struct getbmap; +struct xfs_bmbt_irec; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * List of extents to be free "later". + * The list is kept sorted on xbf_startblock. + */ +typedef struct xfs_bmap_free_item +{ + xfs_fsblock_t xbfi_startblock;/* starting fs block number */ + xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ + struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ +} xfs_bmap_free_item_t; + +/* + * Header for free extent list. + */ +typedef struct xfs_bmap_free +{ + xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + int xbf_count; /* count of items on list */ + int xbf_low; /* kludge: alloc in low mode */ +} xfs_bmap_free_t; + +#define XFS_BMAP_MAX_NMAP 4 + +/* + * Flags for xfs_bmapi + */ +#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ +#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ +#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ +#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ +#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ +#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ +#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ +#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ + /* combine contig. space */ +#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ +#define XFS_BMAPI_DIRECT_IO 0x800 /* Flag from cxfs client, not used + * by xfs directly. 
Indicates alloc + * request is for direct I/O not + * extent conversion by server */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAPI_AFLAG) +int xfs_bmapi_aflag(int w); +#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) +#else +#define XFS_BMAPI_AFLAG(w) ((w) == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0) +#endif + +/* + * Special values for xfs_bmbt_irec_t br_startblock field. + */ +#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) +#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) + +/* + * Trace operations for bmap extent tracing + */ +#define XFS_BMAP_KTRACE_DELETE 1 +#define XFS_BMAP_KTRACE_INSERT 2 +#define XFS_BMAP_KTRACE_PRE_UP 3 +#define XFS_BMAP_KTRACE_POST_UP 4 + +#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */ +#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */ + +#if defined(XFS_ALL_TRACE) +#define XFS_BMAP_TRACE +#endif + +#if !defined(DEBUG) +#undef XFS_BMAP_TRACE +#endif + + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_INIT) +void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp); +#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp) +#else +#define XFS_BMAP_INIT(flp,fbp) \ + ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ + (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK) +#endif + +/* + * Argument structure for xfs_bmap_alloc. 
+ */ +typedef struct xfs_bmalloca { + xfs_fsblock_t firstblock; /* i/o first block allocated */ + xfs_fsblock_t rval; /* starting block of new extent */ + xfs_fileoff_t off; /* offset in file filling in */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec *prevp; /* extent before the new one */ + struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ + xfs_extlen_t alen; /* i/o length asked/allocated */ + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* mininum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + char eof; /* set if allocating past last extent */ + char wasdel; /* replacing a delayed allocation */ + char userdata;/* set if is user data */ + char low; /* low on space, using seq'l ags */ + char aeof; /* allocated space at eof */ +} xfs_bmalloca_t; + +#ifdef __KERNEL__ +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + struct xfs_inode *ip, /* incore inode pointer */ + int rsvd); /* flag for reserved block allocation */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + struct xfs_mount *mp); /* mount point structure */ + +/* + * Routine to clean up the free list data structure when + * an error occurs during a transaction. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist); /* free list to clean up */ + +/* + * Routine to check if a specified inode is swap capable. 
+ */ +int +xfs_bmap_check_swappable( + struct xfs_inode *ip); /* incore inode */ + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + struct xfs_mount *mp, /* file system mount structure */ + int whichfork); /* data or attr fork */ + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. + * + * Return 1 if the given transaction was committed and a new one allocated, + * and 0 otherwise. + */ +int /* error */ +xfs_bmap_finish( + struct xfs_trans **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + xfs_fsblock_t firstblock, /* controlled a.g. for allocs */ + int *committed); /* xact committed or not */ + +/* + * Returns the file-relative block number of the first unused block in the file. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + */ +int /* error */ +xfs_bmap_first_unused( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *unused, /* unused block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_before( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the first block past eof in + * the file. 
This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_offset( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *unused, /* last block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int +xfs_bmap_one_block( + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +/* + * Read in the extents to iu_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. + */ +int /* error */ +xfs_bmap_read_extents( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +#if defined(XFS_BMAP_TRACE) +/* + * Add bmap trace insert entries for all the contents of the extent list. + */ +void +xfs_bmap_trace_exlist( + char *fname, /* function name */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in list */ + int whichfork); /* data or attr fork */ +#else +#define xfs_bmap_trace_exlist(f,ip,c,w) +#endif + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". 
+ * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist); /* i/o: list extents to free */ + +/* + * Map file blocks to filesystem blocks, simple version. + * One block only, read-only. + * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* error */ +xfs_bmapi_single( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* output: mapped block */ + xfs_fileoff_t bno); /* starting file offs. mapped */ + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* XFS_BMAPI_... */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done); /* set if not done yet */ + +/* + * Fcntl interface to xfs_bmapi. + */ +int /* error code */ +xfs_getbmap( + bhv_desc_t *bdp, /* XFS behavior descriptor*/ + struct getbmap *bmv, /* user bmap structure */ + void *ap, /* pointer to user's array */ + int iflags); /* interface flags */ + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + */ +int +xfs_bmap_isaeof( + struct xfs_inode *ip, + xfs_fileoff_t off, + int whichfork, + char *aeof); + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof); + +/* + * Count fsblocks of the given fork. + */ +int +xfs_bmap_count_blocks( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork, + int *count); + +/* + * Check an extent list, which has just been read, for + * any bit in the extent flag field. + */ +int +xfs_check_nostate_extents( + xfs_bmbt_rec_t *ep, + xfs_extnum_t num); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_BMAP_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_bmap_btree.c linux-2.4-xfs/fs/xfs/xfs_bmap_btree.c --- linux-2.4.19/fs/xfs/xfs_bmap_btree.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_bmap_btree.c Thu Jul 18 13:35:09 2002 @@ -0,0 +1,2634 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+#ifdef DEBUG
+/* Global trace buffer written by xfs_bmbt_trace_enter(). */
+ktrace_t	*xfs_bmbt_trace_buf;
+#endif
+
+/*
+ * Prototypes for internal btree functions.
+ */
+
+
+STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *, int);
+STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
+		xfs_bmbt_key_t *, xfs_btree_cur_t **, int *);
+STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
+
+
+#if defined(XFS_BMBT_TRACE)
+/*
+ * Add a trace buffer entry for the arguments given to the routine,
+ * generic form.
+ */
+STATIC void
+xfs_bmbt_trace_enter(
+	char		*func,
+	xfs_btree_cur_t	*cur,
+	char		*s,
+	int		type,
+	int		line,
+	__psunsigned_t	a0,
+	__psunsigned_t	a1,
+	__psunsigned_t	a2,
+	__psunsigned_t	a3,
+	__psunsigned_t	a4,
+	__psunsigned_t	a5,
+	__psunsigned_t	a6,
+	__psunsigned_t	a7,
+	__psunsigned_t	a8,
+	__psunsigned_t	a9,
+	__psunsigned_t	a10)
+{
+	xfs_inode_t	*ip;
+	int		whichfork;
+
+	ip = cur->bc_private.b.ip;
+	whichfork = cur->bc_private.b.whichfork;
+	/*
+	 * The first slot packs type, fork and source line into one word.
+	 * The same entry goes to the global buffer and to the inode's own.
+	 */
+	ktrace_enter(xfs_bmbt_trace_buf,
+		(void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+		(void *)func, (void *)s, (void *)ip, (void *)cur,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
+		(void *)a8, (void *)a9, (void *)a10);
+	ASSERT(ip->i_btrace);
+	ktrace_enter(ip->i_btrace,
+		(void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+		(void *)func, (void *)s, (void *)ip, (void *)cur,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
+		(void *)a8, (void *)a9, (void *)a10);
+}
+
+/*
+ * Tag string handed to xfs_bmbt_trace_enter() by the argument-tracing
+ * helpers below.  It must be declared ahead of them or the file does not
+ * compile with XFS_BMBT_TRACE defined.  ENTRY, ERROR and EXIT are only
+ * named at XFS_BMBT_TRACE_CURSOR() call sites further down the file, so
+ * they stay after the macros.
+ */
+static char	ARGS[] = "args";
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argbi(
+	char		*func,
+	xfs_btree_cur_t	*cur,
+	xfs_buf_t	*b,
+	int		i,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
+		(__psunsigned_t)b, i, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
+ */
+STATIC void
+xfs_bmbt_trace_argbii(
+	char		*func,
+	xfs_btree_cur_t	*cur,
+	xfs_buf_t	*b,
+	int		i0,
+	int		i1,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
+		(__psunsigned_t)b, i0, i1, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for 3 block-length args
+ * and an integer arg.
+ * Each of the three wide values is logged as a high word followed by
+ * a low word.
+ */
+STATIC void
+xfs_bmbt_trace_argfffi(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	xfs_dfiloff_t		o,
+	xfs_dfsbno_t		b,
+	xfs_dfilblks_t		i,
+	int			j,
+	int			line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
+		o >> 32, (int)o, b >> 32, (int)b,
+		i >> 32, (int)i, (int)j, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for one integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argi(
+	char		*func,
+	xfs_btree_cur_t	*cur,
+	int		i,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
+		i, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, key.
+ * Only the key's br_startoff is recorded.
+ */
+STATIC void
+xfs_bmbt_trace_argifk(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_fsblock_t		f,
+	xfs_bmbt_key_t		*k,
+	int			line)
+{
+	xfs_dfsbno_t		d;
+	xfs_dfiloff_t		o;
+
+	d = (xfs_dfsbno_t)f;
+	o = INT_GET(k->br_startoff, ARCH_CONVERT);
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
+		i, d >> 32, (int)d, o >> 32,
+		(int)o, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, rec.
+ * The record is unpacked with xfs_bmbt_get_all() and its start offset,
+ * start block and block count are logged.
+ */
+STATIC void
+xfs_bmbt_trace_argifr(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_fsblock_t		f,
+	xfs_bmbt_rec_t		*r,
+	int			line)
+{
+	xfs_dfsbno_t		b;
+	xfs_dfilblks_t		c;
+	xfs_dfsbno_t		d;
+	xfs_dfiloff_t		o;
+	xfs_bmbt_irec_t		s;
+
+	d = (xfs_dfsbno_t)f;
+	xfs_bmbt_get_all(r, &s);
+	o = (xfs_dfiloff_t)s.br_startoff;
+	b = (xfs_dfsbno_t)s.br_startblock;
+	c = s.br_blockcount;
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
+		i, d >> 32, (int)d, o >> 32,
+		(int)o, b >> 32, (int)b, c >> 32,
+		(int)c, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, key.
+ *
+ * NOTE(review): the entry is tagged XFS_BMBT_KTRACE_ARGIFK although no
+ * fsblock is logged here; this looks like a copy of the call in
+ * xfs_bmbt_trace_argifk().  Confirm whether a dedicated type code for
+ * the int/key form exists and should be used instead.
+ */
+STATIC void
+xfs_bmbt_trace_argik(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_bmbt_key_t		*k,
+	int			line)
+{
+	xfs_dfiloff_t		o;
+
+	o = INT_GET(k->br_startoff, ARCH_CONVERT);
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
+		i, o >> 32, (int)o, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for the cursor/operation.
+ * Logs the cursor's level count, flags and allocated count, its current
+ * record (packed with xfs_bmbt_set_all()), and the first four buffer and
+ * ptr slots.
+ */
+STATIC void
+xfs_bmbt_trace_cursor(
+	char		*func,
+	xfs_btree_cur_t	*cur,
+	char		*s,
+	int		line)
+{
+	xfs_bmbt_rec_t	r;
+
+	xfs_bmbt_set_all(&r, &cur->bc_rec.b);
+	xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
+		(cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
+		cur->bc_private.b.allocated,
+#if BMBT_USE_64
+		INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT),
+#else
+		INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT), INT_GET(r.l2, ARCH_CONVERT), INT_GET(r.l3, ARCH_CONVERT),
+#endif
+		(unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
+		(unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
+		(cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
+		(cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
+}
+
+/*
+ * Call-site wrappers.  Each expects a local "fname" naming the calling
+ * function and adds the source line itself.
+ */
+#define	XFS_BMBT_TRACE_ARGBI(c,b,i)	\
+	xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__)
+#define	XFS_BMBT_TRACE_ARGBII(c,b,i,j)	\
+	xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__)
+#define	XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)	\
+	xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
+#define	XFS_BMBT_TRACE_ARGI(c,i)	\
+	xfs_bmbt_trace_argi(fname, c, i, __LINE__)
+#define	XFS_BMBT_TRACE_ARGIFK(c,i,f,k)	\
+	xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__)
+#define	XFS_BMBT_TRACE_ARGIFR(c,i,f,r)	\
+	xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
+#define	XFS_BMBT_TRACE_ARGIK(c,i,k)	\
+	xfs_bmbt_trace_argik(fname, c, i, k, __LINE__)
+#define	XFS_BMBT_TRACE_CURSOR(c,s)	\
+	xfs_bmbt_trace_cursor(fname, c, s, __LINE__)
+static char	ENTRY[] =
"entry"; +static char ERROR[] = "error"; +#undef EXIT +static char EXIT[] = "exit"; +#else +#define XFS_BMBT_TRACE_ARGBI(c,b,i) +#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) +#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) +#define XFS_BMBT_TRACE_ARGI(c,i) +#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) +#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) +#define XFS_BMBT_TRACE_ARGIK(c,i,k) +#define XFS_BMBT_TRACE_CURSOR(c,s) +#endif /* XFS_BMBT_TRACE */ + + +/* + * Internal functions. + */ + +/* + * Delete record pointed to by cur/level. + */ +STATIC int /* error */ +xfs_bmbt_delrec( + xfs_btree_cur_t *cur, + int level, + int async, /* deletion can be async */ + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block; /* bmap btree block */ + xfs_fsblock_t bno; /* fs-relative block number */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_delrec"; +#endif + int i; /* loop counter */ + int j; /* temp state */ + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ + xfs_fsblock_t lbno; /* left sibling block number */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + int lrecs=0; /* left record count */ + xfs_bmbt_rec_t *lrp; /* left record pointer */ + xfs_mount_t *mp; /* file system mount point */ + xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ + int ptr; /* key/record index */ + xfs_fsblock_t rbno; /* right sibling block number */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp; /* right btree key */ + xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */ + xfs_bmbt_ptr_t *rpp; /* right address pointer */ + xfs_bmbt_block_t *rrblock; /* right-right btree block */ + xfs_buf_t *rrbp; /* right-right buffer pointer */ + int rrecs=0; /* right record 
count */ + xfs_bmbt_rec_t *rrp; /* right record pointer */ + xfs_btree_cur_t *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + int numlrecs, numrrecs; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + ptr = cur->bc_ptrs[level]; + tcur = (xfs_btree_cur_t *)0; + if (ptr == 0) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + block = xfs_bmbt_get_block(cur, level, &bp); + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } +#endif + if (ptr > numrecs) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + XFS_STATS_INC(xfsstats.xs_bmbt_delrec); + if (level > 0) { + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); +#ifdef DEBUG + for (i = ptr; i < numrecs; i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + } +#endif + if (ptr < numrecs) { + ovbcopy(&kp[ptr], &kp[ptr - 1], + (numrecs - ptr) * sizeof(*kp)); + ovbcopy(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */ + (numrecs - ptr) * sizeof(*pp)); + xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1); + xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1); + } + } else { + rp = XFS_BMAP_REC_IADDR(block, 1, cur); + if (ptr < numrecs) { + ovbcopy(&rp[ptr], &rp[ptr - 1], + (numrecs - ptr) * sizeof(*rp)); + xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1); + } + if (ptr == 1) { + INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rp)); + kp = &key; + } + } + numrecs--; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); + /* + * We're at the root level. + * First, shrink the root block in-memory. + * Try to get rid of the next level down. + * If we can't then there's nothing left to do. 
+ */ + if (level == cur->bc_nlevels - 1) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + if ((error = xfs_bmbt_killroot(cur, async))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) { + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT); + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. 
+ */ + if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK && + level == cur->bc_nlevels - 2) { + if ((error = xfs_bmbt_killroot(cur, async))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK); + if ((error = xfs_btree_dup_cursor(cur, &tcur))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + bno = NULLFSBLOCK; + if (rbno != NULLFSBLOCK) { + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_increment(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + rbp = tcur->bc_bufs[level]; + right = XFS_BUF_TO_BMBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } +#endif + bno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_BMAP_BLOCK_IMINRECS(level, cur)) { + if ((error = xfs_bmbt_lshift(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_BMAP_BLOCK_IMINRECS(level, tcur)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level > 0) { + if ((error = xfs_bmbt_decrement(cur, + level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, + ERROR); + goto error0; + } + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + } + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if (lbno != NULLFSBLOCK) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_decrement(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, 
ERROR); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + if (lbno != NULLFSBLOCK) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * decrement to last in block + */ + if ((error = xfs_bmbt_decrement(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + lbp = tcur->bc_bufs[level]; + left = XFS_BUF_TO_BMBT_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } +#endif + bno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_BMAP_BLOCK_IMINRECS(level, cur)) { + if ((error = xfs_bmbt_rshift(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_BMAP_BLOCK_IMINRECS(level, tcur)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + } + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + mp = cur->bc_mp; + ASSERT(bno != NULLFSBLOCK); + if (lbno != NULLFSBLOCK && + lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + rbno = bno; + right = block; + rbp = bp; + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + left = XFS_BUF_TO_BMBT_BLOCK(lbp); + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + } else if (rbno != NULLFSBLOCK && + rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= + XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + lbno = bno; + left = block; + lbp = bp; + if ((error = 
xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + right = XFS_BUF_TO_BMBT_BLOCK(rbp); + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + } else { + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + numlrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + numrrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if (level > 0) { + lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur); + lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < numrrecs; i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + } +#endif + bcopy(rkp, lkp, numrrecs * sizeof(*lkp)); + bcopy(rpp, lpp, numrrecs * sizeof(*lpp)); + xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs); + xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); + } else { + lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + bcopy(rrp, lrp, numrrecs * sizeof(*lrp)); + xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); + } + INT_MOD(left->bb_numrecs, ARCH_CONVERT, numrrecs); + left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */ + xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS); + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, + INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rrbp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); + if ((error = 
xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno); + xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1, + cur->bc_private.b.flist, mp); + if (!async) + xfs_trans_set_sync(cur->bc_tp); + cur->bc_private.b.ip->i_d.di_nblocks--; + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + if (XFS_IS_QUOTA_ON(mp) && + cur->bc_private.b.ip->i_ino != mp->m_sb.sb_uquotino && + cur->bc_private.b.ip->i_ino != mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(cur->bc_tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(cur->bc_tp, rbp); + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (level > 0) + cur->bc_ptrs[level]--; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 2; + return 0; + +error0: + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +#ifdef XFSDEBUG +/* + * Get the data from the pointed-to record. 
+ */ +int +xfs_bmbt_get_rec( + xfs_btree_cur_t *cur, + xfs_fileoff_t *off, + xfs_fsblock_t *bno, + xfs_filblks_t *len, + xfs_exntst_t *state, + int *stat) +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; +#ifdef DEBUG + int error; +#endif + int ptr; + xfs_bmbt_rec_t *rp; + + block = xfs_bmbt_get_block(cur, 0, &bp); + ptr = cur->bc_ptrs[0]; +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) + return error; +#endif + if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) { + *stat = 0; + return 0; + } + rp = XFS_BMAP_REC_IADDR(block, ptr, cur); + *off = xfs_bmbt_get_startoff(rp); + *bno = xfs_bmbt_get_startblock(rp); + *len = xfs_bmbt_get_blockcount(rp); + *state = xfs_bmbt_get_state(rp); + *stat = 1; + return 0; +} +#endif + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int /* error */ +xfs_bmbt_insrec( + xfs_btree_cur_t *cur, + int level, + xfs_fsblock_t *bnop, + xfs_bmbt_rec_t *recp, + xfs_btree_cur_t **curp, + int *stat) /* no-go/done/continue */ +{ + xfs_bmbt_block_t *block; /* bmap btree block */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_insrec"; +#endif + int i; /* loop index */ + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ + int logflags; /* inode logging flags */ + xfs_fsblock_t nbno; /* new block number */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + xfs_bmbt_key_t nkey; /* new btree key value */ + xfs_bmbt_rec_t nrec; /* new record count */ + int optr; /* old key/record index */ + xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ + int ptr; /* key/record index */ + xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */ + int numrecs; + + ASSERT(level < cur->bc_nlevels); + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp); + ncur = 
(xfs_btree_cur_t *)0; + INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(recp)); + optr = ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + XFS_STATS_INC(xfsstats.xs_bmbt_insrec); + block = xfs_bmbt_get_block(cur, level, &bp); + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (ptr <= numrecs) { + if (level == 0) { + rp = XFS_BMAP_REC_IADDR(block, ptr, cur); + xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp); + } else { + kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); + xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp); + } + } +#endif + nbno = NULLFSBLOCK; + if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { + /* + * A root block, that can be made bigger. + */ + xfs_iroot_realloc(cur->bc_private.b.ip, 1, + cur->bc_private.b.whichfork); + block = xfs_bmbt_get_block(cur, level, &bp); + } else if (level == cur->bc_nlevels - 1) { + if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) || + *stat == 0) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + logflags); + block = xfs_bmbt_get_block(cur, level, &bp); + } else { + if ((error = xfs_bmbt_rshift(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (i) { + /* nothing */ + } else { + if ((error = xfs_bmbt_lshift(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (i) { + optr = ptr = cur->bc_ptrs[level]; + } else { + if ((error = xfs_bmbt_split(cur, level, + &nbno, &nkey, &ncur, + &i))) { + XFS_BMBT_TRACE_CURSOR(cur, + ERROR); + return error; + } + if (i) { + block = xfs_bmbt_get_block( + cur, level, &bp); +#ifdef DEBUG + if ((error = + xfs_btree_check_lblock(cur, + block, level, bp))) { + XFS_BMBT_TRACE_CURSOR( + cur, ERROR); + return 
error; + } +#endif + ptr = cur->bc_ptrs[level]; + xfs_bmbt_set_allf(&nrec, + nkey.br_startoff, 0, 0, + XFS_EXT_NORM); + } else { + XFS_BMBT_TRACE_CURSOR(cur, + EXIT); + *stat = 0; + return 0; + } + } + } + } + } + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + if (level > 0) { + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); +#ifdef DEBUG + for (i = numrecs; i >= ptr; i--) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + ovbcopy(&kp[ptr - 1], &kp[ptr], + (numrecs - ptr + 1) * sizeof(*kp)); + ovbcopy(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */ + (numrecs - ptr + 1) * sizeof(*pp)); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop, + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + kp[ptr - 1] = key; + INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_bmbt_log_keys(cur, bp, ptr, numrecs); + xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs); + } else { + rp = XFS_BMAP_REC_IADDR(block, 1, cur); + ovbcopy(&rp[ptr - 1], &rp[ptr], + (numrecs - ptr + 1) * sizeof(*rp)); + rp[ptr - 1] = *recp; + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_bmbt_log_recs(cur, bp, ptr, numrecs); + } + xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); +#ifdef DEBUG + if (ptr < numrecs) { + if (level == 0) + xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1, + rp + ptr); + else + xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1, + kp + ptr); + } +#endif + if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + *bnop = nbno; + if (nbno != NULLFSBLOCK) { + *recp = nrec; + *curp = ncur; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_bmbt_killroot( + xfs_btree_cur_t *cur, + int async) +{ + xfs_bmbt_block_t 
*block; + xfs_bmbt_block_t *cblock; + xfs_buf_t *cbp; + xfs_bmbt_key_t *ckp; + xfs_bmbt_ptr_t *cpp; +#ifdef DEBUG + int error; +#endif +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_killroot"; +#endif + int i; + xfs_bmbt_key_t *kp; + xfs_inode_t *ip; + xfs_ifork_t *ifp; + int level; + xfs_bmbt_ptr_t *pp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + level = cur->bc_nlevels - 1; + ASSERT(level >= 1); + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + if (level == 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + block = xfs_bmbt_get_block(cur, level, &cbp); + /* + * Give up if the root has multiple children. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) != 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + cbp = cur->bc_bufs[level - 1]; + cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); + if (INT_GET(cblock->bb_numrecs, ARCH_CONVERT) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + ASSERT(INT_GET(cblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO); + ASSERT(INT_GET(cblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO); + ip = cur->bc_private.b.ip; + ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork); + ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) == + XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes)); + i = (int)(INT_GET(cblock->bb_numrecs, ARCH_CONVERT) - XFS_BMAP_BLOCK_IMAXRECS(level, cur)); + if (i) { + xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + INT_MOD(block->bb_numrecs, ARCH_CONVERT, i); + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) == INT_GET(cblock->bb_numrecs, ARCH_CONVERT)); + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); + bcopy(ckp, kp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * 
sizeof(*kp)); + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); + cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + bcopy(cpp, pp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*pp)); + xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1, + cur->bc_private.b.flist, cur->bc_mp); + if (!async) + xfs_trans_set_sync(cur->bc_tp); + ip->i_d.di_nblocks--; + if (XFS_IS_QUOTA_ON(cur->bc_mp) && + ip->i_ino != cur->bc_mp->m_sb.sb_uquotino && + ip->i_ino != cur->bc_mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(cur->bc_tp, ip, XFS_TRANS_DQ_BCOUNT, + -1L); + xfs_trans_binval(cur->bc_tp, cbp); + cur->bc_bufs[level - 1] = NULL; + INT_MOD(block->bb_level, ARCH_CONVERT, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; +} + +/* + * Log key values from the btree block. 
+ */ +STATIC void +xfs_bmbt_log_keys( + xfs_btree_cur_t *cur, + xfs_buf_t *bp, + int kfirst, + int klast) +{ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_keys"; +#endif + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast); + tp = cur->bc_tp; + if (bp) { + xfs_bmbt_block_t *block; + int first; + xfs_bmbt_key_t *kp; + int last; + + block = XFS_BUF_TO_BMBT_BLOCK(bp); + kp = XFS_BMAP_KEY_DADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(tp, bp, first, last); + } else { + xfs_inode_t *ip; + + ip = cur->bc_private.b.ip; + xfs_trans_log_inode(tp, ip, + XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +/* + * Log pointer values from the btree block. + */ +STATIC void +xfs_bmbt_log_ptrs( + xfs_btree_cur_t *cur, + xfs_buf_t *bp, + int pfirst, + int plast) +{ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_ptrs"; +#endif + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast); + tp = cur->bc_tp; + if (bp) { + xfs_bmbt_block_t *block; + int first; + int last; + xfs_bmbt_ptr_t *pp; + + block = XFS_BUF_TO_BMBT_BLOCK(bp); + pp = XFS_BMAP_PTR_DADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(tp, bp, first, last); + } else { + xfs_inode_t *ip; + + ip = cur->bc_private.b.ip; + xfs_trans_log_inode(tp, ip, + XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. 
+ */ +STATIC int /* error */ +xfs_bmbt_lookup( + xfs_btree_cur_t *cur, + xfs_lookup_t dir, + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block=NULL; + xfs_buf_t *bp; + xfs_daddr_t d; + xfs_sfiloff_t diff; + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_lookup"; +#endif + xfs_fsblock_t fsbno=0; + int high; + int i; + int keyno=0; + xfs_bmbt_key_t *kkbase=NULL; + xfs_bmbt_key_t *kkp; + xfs_bmbt_rec_t *krbase=NULL; + xfs_bmbt_rec_t *krp; + int level; + int low; + xfs_mount_t *mp; + xfs_bmbt_ptr_t *pp; + xfs_bmbt_irec_t *rp; + xfs_fileoff_t startoff; + xfs_trans_t *tp; + + XFS_STATS_INC(xfsstats.xs_bmbt_lookup); + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, (int)dir); + tp = cur->bc_tp; + mp = cur->bc_mp; + rp = &cur->bc_rec.b; + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + if (level < cur->bc_nlevels - 1) { + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) != d) + bp = (xfs_buf_t *)0; + if (!bp) { + if ((error = xfs_btree_read_bufl(mp, tp, fsbno, + 0, &bp, XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + xfs_btree_setbuf(cur, level, bp); + block = XFS_BUF_TO_BMBT_BLOCK(bp); + if ((error = xfs_btree_check_lblock(cur, block, + level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } else + block = XFS_BUF_TO_BMBT_BLOCK(bp); + } else + block = xfs_bmbt_get_block(cur, level, &bp); + if (diff == 0) + keyno = 1; + else { + if (level > 0) + kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur); + else + krbase = XFS_BMAP_REC_IADDR(block, 1, cur); + low = 1; + if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) { + ASSERT(level == 0); + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + while (low <= high) { + XFS_STATS_INC(xfsstats.xs_bmbt_compare); + keyno = (low + high) >> 1; + if (level > 0) { + kkp = kkbase + keyno - 1; + startoff 
= INT_GET(kkp->br_startoff, ARCH_CONVERT); + } else { + krp = krbase + keyno - 1; + startoff = xfs_bmbt_get_startoff(krp); + } + diff = (xfs_sfiloff_t) + (startoff - rp->br_startoff); + if (diff < 0) + low = keyno + 1; + else if (diff > 0) + high = keyno - 1; + else + break; + } + } + if (level > 0) { + if (diff > 0 && --keyno < 1) + keyno = 1; + pp = XFS_BMAP_PTR_IADDR(block, keyno, cur); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + fsbno = INT_GET(*pp, ARCH_CONVERT); + cur->bc_ptrs[level] = keyno; + } + } + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + if (dir == XFS_LOOKUP_GE && keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) && + INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { + cur->bc_ptrs[0] = keyno; + if ((error = xfs_bmbt_increment(cur, 0, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + } + else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + } else { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); + } + return 0; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. 
+ */ +STATIC int /* error */ +xfs_bmbt_lshift( + xfs_btree_cur_t *cur, + int level, + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_lshift"; +#endif +#ifdef DEBUG + int i; /* loop counter */ +#endif + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp=NULL; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + int lrecs; /* left record count */ + xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */ + xfs_mount_t *mp; /* file system mount point */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp=NULL; /* right btree key */ + xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */ + xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ + int rrecs; /* right record count */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + if (level == cur->bc_nlevels - 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + rbp = cur->bc_bufs[level]; + right = XFS_BUF_TO_BMBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + if (cur->bc_ptrs[level] <= 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + mp = cur->bc_mp; + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, + &lbp, XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + left = XFS_BUF_TO_BMBT_BLOCK(lbp); + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == 
XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1; + if (level > 0) { + lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + *lkp = *rkp; + xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs); + lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + *lpp = *rpp; /* INT_: direct copy */ + xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs); + } else { + lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + *lrp = *rrp; + xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs); + } + INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs); + xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); +#ifdef DEBUG + if (level > 0) + xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp); + else + xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp); +#endif + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; + INT_SET(right->bb_numrecs, ARCH_CONVERT, rrecs); + xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); + if (level > 0) { +#ifdef DEBUG + for (i = 0; i < rrecs; i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + ovbcopy(rkp + 1, rkp, rrecs * sizeof(*rkp)); + ovbcopy(rpp + 1, rpp, rrecs * sizeof(*rpp)); + xfs_bmbt_log_keys(cur, rbp, 1, rrecs); + xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs); + } else { + ovbcopy(rrp + 1, rrp, rrecs * sizeof(*rrp)); + xfs_bmbt_log_recs(cur, rbp, 1, rrecs); + INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rrp)); + rkp = &key; + } + if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_ptrs[level]--; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 
1; + return 0; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_bmbt_rshift( + xfs_btree_cur_t *cur, + int level, + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_rshift"; +#endif + int i; /* loop counter */ + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + xfs_bmbt_rec_t *lrp; /* left record pointer */ + xfs_mount_t *mp; /* file system mount point */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp; /* right btree key */ + xfs_bmbt_ptr_t *rpp; /* right address pointer */ + xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + if (level == cur->bc_nlevels - 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + lbp = cur->bc_bufs[level]; + left = XFS_BUF_TO_BMBT_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + mp = cur->bc_mp; + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, + &rbp, XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + right = XFS_BUF_TO_BMBT_BLOCK(rbp); + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + 
XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + if (level > 0) { + lkp = XFS_BMAP_KEY_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + lpp = XFS_BMAP_PTR_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + *rkp = *lkp; + *rpp = *lpp; /* INT_: direct copy */ + xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + } else { + lrp = XFS_BMAP_REC_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + *rrp = *lrp; + xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rrp)); + rkp = &key; + } + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1); + xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); +#ifdef DEBUG + if (level > 0) + xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1); + else + xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1); +#endif + xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); + if 
((error = xfs_btree_dup_cursor(cur, &tcur))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_increment(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(tcur, ERROR); + goto error1; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) { + XFS_BMBT_TRACE_CURSOR(tcur, ERROR); + goto error1; + } + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +error0: + XFS_BMBT_TRACE_CURSOR(cur, ERROR); +error1: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Determine the extent state. + */ +/* ARGSUSED */ +STATIC xfs_exntst_t +xfs_extent_state( + xfs_filblks_t blks, + int extent_flag) +{ + if (extent_flag) { + ASSERT(blks != 0); /* saved for DMIG */ + return XFS_EXT_UNWRITTEN; + } + return XFS_EXT_NORM; +} + + +/* + * Split cur/level block in half. + * Return new block number and its first record (to be inserted into parent). 
+ */ +STATIC int /* error */ +xfs_bmbt_split( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level of the block to split */ + xfs_fsblock_t *bnop, /* out: block number of the new right block */ + xfs_bmbt_key_t *keyp, /* out: first key (startoff) of the new right block */ + xfs_btree_cur_t **curp, /* out: duplicated cursor, set only when a parent level exists */ + int *stat) /* success/failure */ +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_split"; +#endif + int i; /* index of first entry moved right; also DEBUG loop counter */ + xfs_fsblock_t lbno; /* left sibling block number */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + xfs_bmbt_rec_t *lrp; /* left record pointer */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp; /* right btree key */ + xfs_bmbt_ptr_t *rpp; /* right address pointer */ + xfs_bmbt_block_t *rrblock; /* right-right btree block */ + xfs_buf_t *rrbp; /* right-right buffer pointer */ + xfs_bmbt_rec_t *rrp; /* right record pointer */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + lbp = cur->bc_bufs[level]; + lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp)); + left = XFS_BUF_TO_BMBT_BLOCK(lbp); + args.fsbno = cur->bc_private.b.firstblock; + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = lbno; /* no first block recorded: use the number of the block being split */ + args.type = XFS_ALLOCTYPE_START_BNO; + } else if (cur->bc_private.b.flist->xbf_low) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.mod = args.minleft = args.alignment = args.total = args.isfl = + args.userdata = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; /* ask for exactly one block */ + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return XFS_ERROR(ENOSPC); + } + if ((error = xfs_alloc_vextent(&args))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return 
error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; /* no block was allocated: reported through *stat, not as an error */ + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + if (XFS_IS_QUOTA_ON(args.mp) && + cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_uquotino && + cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0); + right = XFS_BUF_TO_BMBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) { /* NOTE(review): validates 'left' but passes the new right buffer rbp; other calls pair a block with its own buffer -- confirm lbp was not intended */ + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + INT_SET(right->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC); + right->bb_level = left->bb_level; /* INT_: direct copy */ + INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2)); /* right gets half of left's entries */ + if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) && + cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1) + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); /* odd count and cursor in the low part: right takes the extra entry */ + i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1; /* 1-based index in left of the first entry to move */ + if (level > 0) { /* interior block: move keys and child pointers */ + lkp = XFS_BMAP_KEY_IADDR(left, i, cur); + lpp = XFS_BMAP_PTR_IADDR(left, i, cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { /* i is reused; lkp/lpp were already computed from it */ + if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); + xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_bmbt_log_ptrs(cur, rbp, 1, 
INT_GET(right->bb_numrecs, ARCH_CONVERT)); + keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT); /* first key now in the right block */ + } else { /* leaf block: move records */ + lrp = XFS_BMAP_REC_IADDR(left, i, cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + keyp->br_startoff = xfs_bmbt_get_startoff(rrp); /* startoff of the first record now in the right block */ + } + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT))); /* left keeps the remainder */ + right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */ + INT_SET(left->bb_rightsib, ARCH_CONVERT, args.fsbno); + INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno); + xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { /* old right neighbour exists: point its leftsib at the new block */ + if ((error = xfs_btree_read_bufl(args.mp, args.tp, + INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); + if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.fsbno); + xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) { /* cursor position moved into the new block: follow it */ + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + if (level + 1 < cur->bc_nlevels) { /* a parent level exists: hand back a second cursor */ + if ((error = xfs_btree_dup_cursor(cur, curp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + (*curp)->bc_ptrs[level + 1]++; /* new cursor's parent-level pointer is one past the original's */ + } + *bnop = args.fsbno; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + + +/* + * Update keys for the record. 
+ */ +STATIC int +xfs_bmbt_updkey( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_bmbt_key_t *keyp, /* on-disk format */ + int level) /* first level to update; asserted >= 1 */ +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; +#ifdef DEBUG + int error; +#endif +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_updkey"; +#endif + xfs_bmbt_key_t *kp; + int ptr; + + ASSERT(level >= 1); + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGIK(cur, level, keyp); + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { /* keep climbing only while the slot just written was slot 1 of its block */ + block = xfs_bmbt_get_block(cur, level, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); + *kp = *keyp; /* overwrite the key at the cursor's slot for this level */ + xfs_bmbt_log_keys(cur, bp, ptr, ptr); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; +} + +/* + * Convert on-disk form of btree root to in-memory form. + */ +void +xfs_bmdr_to_bmbt( + xfs_bmdr_block_t *dblock, /* on-disk root (source) */ + int dblocklen, /* length used to locate dblock's keys and ptrs */ + xfs_bmbt_block_t *rblock, /* in-memory root (destination) */ + int rblocklen) /* length used to locate rblock's keys and ptrs */ +{ + int dmxr; /* max records of dblock, then reused as the actual record count */ + xfs_bmbt_key_t *fkp; /* from: keys in dblock */ + xfs_bmbt_ptr_t *fpp; /* from: ptrs in dblock */ + xfs_bmbt_key_t *tkp; /* to: keys in rblock */ + xfs_bmbt_ptr_t *tpp; /* to: ptrs in rblock */ + + INT_SET(rblock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC); + rblock->bb_level = dblock->bb_level; /* both in on-disk format */ + ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0); + rblock->bb_numrecs = dblock->bb_numrecs;/* both in on-disk format */ + INT_SET(rblock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO); /* the root gets null sibling pointers */ + INT_SET(rblock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO); + dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); + fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); + fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); + dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT); /* from here on dmxr is the number of entries to copy */ + bcopy(fkp, tkp, sizeof(*fkp) * dmxr); + bcopy(fpp, tpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ +} + +/* + * Decrement cursor 
by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_bmbt_decrement( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level whose pointer is decremented */ + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_decrement"; +#endif + xfs_fsblock_t fsbno; + int lev; + xfs_mount_t *mp; + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + ASSERT(level < cur->bc_nlevels); + if (level < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + if (--cur->bc_ptrs[level] > 0) { /* still inside the current block */ + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + block = xfs_bmbt_get_block(cur, level, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) { /* no left sibling: nothing precedes this block */ + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { /* climb until some level's pointer can be decremented */ + if (--cur->bc_ptrs[lev] > 0) + break; + if (lev < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + if (lev == cur->bc_nlevels) { /* went past the top level without finding one */ + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + tp = cur->bc_tp; + mp = cur->bc_mp; + for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { /* walk back down, reading the child at each level */ + fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_BMBT_BLOCK(bp); + if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT); /* position on the last entry of the block just entered */ + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; 
+} + +/* + * Delete the record pointed to by cur. + */ +int /* error */ +xfs_bmbt_delete( + xfs_btree_cur_t *cur, /* btree cursor */ + int async, /* deletion can be async */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_delete"; +#endif + int i; /* status from xfs_bmbt_delrec() / xfs_bmbt_decrement() */ + int level; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + for (level = 0, i = 2; i == 2; level++) { /* repeat one level higher for as long as xfs_bmbt_delrec() reports 2 */ + if ((error = xfs_bmbt_delrec(cur, level, async, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } + if (i == 0) { /* delrec reported 0: decrement at the first upper level whose pointer is 0 */ + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + if ((error = xfs_bmbt_decrement(cur, level, + &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + break; + } + } + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = i; /* may be the status written by xfs_bmbt_decrement() above */ + return 0; +} + +/* + * Convert a compressed bmap extent record to an uncompressed form. + * This code must be in sync with the routines xfs_bmbt_get_startoff, + * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. 
+ */ +void +xfs_bmbt_get_all( + xfs_bmbt_rec_t *r, /* packed record (source) */ + xfs_bmbt_irec_t *s) /* unpacked record (destination) */ +{ + int ext_flag; + xfs_exntst_t st; + __uint64_t l0, l1; + + l0 = INT_GET(r->l0, ARCH_CONVERT); + l1 = INT_GET(r->l1, ARCH_CONVERT); + ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); /* top BMBT_EXNTFLAG_BITLEN bit(s) of l0 */ + s->br_startoff = ((xfs_fileoff_t)l0 & + XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; /* l0 without the flag, minus its low 9 bits */ +#if XFS_BIG_FILESYSTEMS + s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) | + (((xfs_fsblock_t)l1) >> 21); /* low 9 bits of l0 are the high part, l1 >> 21 the rest */ +#else +#ifdef DEBUG + { + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) | + (((xfs_dfsbno_t)l1) >> 21); + ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); /* full value must fit in 32 bits unless it is a null startblock */ + s->br_startblock = (xfs_fsblock_t)b; + } +#else /* !DEBUG */ + s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21); /* only the l1 part is used here */ +#endif /* DEBUG */ +#endif /* XFS_BIG_FILESYSTEMS */ + s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21)); /* low 21 bits of l1 */ + /* This is xfs_extent_state() in-line */ + if (ext_flag) { + ASSERT(s->br_blockcount != 0); /* saved for DMIG */ + st = XFS_EXT_UNWRITTEN; + } else + st = XFS_EXT_NORM; + s->br_state = st; +} + +/* + * Get the block pointer for the given level of the cursor. + * Fill in the buffer pointer, if applicable. + */ +xfs_bmbt_block_t * +xfs_bmbt_get_block( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level whose block is wanted */ + xfs_buf_t **bpp) /* out: buffer for that level, or 0 for the top level */ +{ + xfs_ifork_t *ifp; + xfs_bmbt_block_t *rval; + + if (level < cur->bc_nlevels - 1) { /* below the top level: block lives in the cursor's buffer */ + *bpp = cur->bc_bufs[level]; + rval = XFS_BUF_TO_BMBT_BLOCK(*bpp); + } else { /* top level: block is if_broot of the inode fork, no buffer */ + *bpp = 0; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + rval = ifp->if_broot; + } + return rval; +} + +/* + * Extract the blockcount field from a bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_get_blockcount( + xfs_bmbt_rec_t *r) /* packed record */ +{ + return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21)); /* low 21 bits of l1 */ +} + +/* + * Extract the startblock field from a bmap extent record. 
+ */ +xfs_fsblock_t +xfs_bmbt_get_startblock( + xfs_bmbt_rec_t *r) /* packed record */ +{ +#if XFS_BIG_FILESYSTEMS + return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | + (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); /* low 9 bits of l0 are the high part, l1 >> 21 the rest */ +#else +#ifdef DEBUG + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | + (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); + ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); /* full value must fit in 32 bits unless it is a null startblock */ + return (xfs_fsblock_t)b; +#else /* !DEBUG */ + return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); /* only the l1 part is used here */ +#endif /* DEBUG */ +#endif /* XFS_BIG_FILESYSTEMS */ +} + +/* + * Extract the startoff field from a bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_get_startoff( + xfs_bmbt_rec_t *r) /* packed record */ +{ + return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) & + XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; /* l0 without the flag, minus its low 9 bits */ +} + +xfs_exntst_t /* extent state of the record */ +xfs_bmbt_get_state( + xfs_bmbt_rec_t *r) /* packed record */ +{ + int ext_flag; + + ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN)); /* top flag bit(s) of l0 */ + return xfs_extent_state(xfs_bmbt_get_blockcount(r), + ext_flag); /* state is derived from the blockcount and the flag */ +} + + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. 
+ */ +int /* error */ +xfs_bmbt_increment( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level whose pointer is incremented */ + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_increment"; +#endif + xfs_fsblock_t fsbno; + int lev; + xfs_mount_t *mp; + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + ASSERT(level < cur->bc_nlevels); + if (level < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + block = xfs_bmbt_get_block(cur, level, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { /* still inside the current block */ + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) { /* no right sibling: nothing follows this block */ + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { /* climb until some level's pointer can be incremented */ + block = xfs_bmbt_get_block(cur, lev, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) + break; + if (lev < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + if (lev == cur->bc_nlevels) { /* went past the top level without finding one */ + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + tp = cur->bc_tp; + mp = cur->bc_mp; + for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { /* walk back down, reading the child at each level */ + fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_BMBT_BLOCK(bp); + if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { + 
XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_ptrs[lev] = 1; /* position on the first entry of the block just entered */ + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + +/* + * Insert the current record at the point referenced by cur. + */ +int /* error */ +xfs_bmbt_insert( + xfs_btree_cur_t *cur, /* btree cursor; the record comes from cur->bc_rec.b */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_insert"; +#endif + int i; /* status from xfs_bmbt_insrec() */ + int level; /* level for the next insrec call */ + xfs_fsblock_t nbno; /* block number handed back by insrec, or NULLFSBLOCK */ + xfs_btree_cur_t *ncur; /* cursor handed back by insrec, if any */ + xfs_bmbt_rec_t nrec; /* packed form of the record being inserted */ + xfs_btree_cur_t *pcur; /* cursor used for the current level */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + level = 0; + nbno = NULLFSBLOCK; + xfs_bmbt_set_all(&nrec, &cur->bc_rec.b); + ncur = (xfs_btree_cur_t *)0; + pcur = cur; + do { /* one insrec call per level, until no block number is handed back */ + if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur, + &i))) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* NOTE(review): unlike the error path above, a temporary pcur is not deleted on this exit, and 'error' is presumably set by the macro -- confirm both */ + if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { /* finished with a temporary cursor: fold its state into cur and delete it */ + cur->bc_nlevels = pcur->bc_nlevels; + cur->bc_private.b.allocated += + pcur->bc_private.b.allocated; + pcur->bc_private.b.allocated = 0; + ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) || + (cur->bc_private.b.ip->i_d.di_flags & + XFS_DIFLAG_REALTIME)); + cur->bc_private.b.firstblock = + pcur->bc_private.b.firstblock; + ASSERT(cur->bc_private.b.flist == + pcur->bc_private.b.flist); + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + if (ncur) { /* continue at the next level with the cursor insrec returned */ + pcur = ncur; + ncur = (xfs_btree_cur_t *)0; + } + } while (nbno != NULLFSBLOCK); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = i; + return 0; +error0: + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; +} + +/* + * Log fields from the btree block header. 
+ */ +void +xfs_bmbt_log_block( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer holding the block, or 0 to log the inode's broot instead */ + int fields) /* header field selection passed to xfs_btree_offsets() */ +{ + int first; /* first byte offset to log */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_block"; +#endif + int last; /* last byte offset to log */ + xfs_trans_t *tp; + static const short offsets[] = { /* header field offsets; the final entry is the header size */ + offsetof(xfs_bmbt_block_t, bb_magic), + offsetof(xfs_bmbt_block_t, bb_level), + offsetof(xfs_bmbt_block_t, bb_numrecs), + offsetof(xfs_bmbt_block_t, bb_leftsib), + offsetof(xfs_bmbt_block_t, bb_rightsib), + sizeof(xfs_bmbt_block_t) + }; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBI(cur, bp, fields); + tp = cur->bc_tp; + if (bp) { + xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, + &last); + xfs_trans_log_buf(tp, bp, first, last); + } else /* no buffer: the block is the root kept in the inode fork */ + xfs_trans_log_inode(tp, cur->bc_private.b.ip, + XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +/* + * Log record values from the btree block. + */ +void +xfs_bmbt_log_recs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer holding the block; asserted non-null */ + int rfirst, /* first record to log, counted from 1 */ + int rlast) /* last record to log, inclusive */ +{ + xfs_bmbt_block_t *block; + int first; /* byte offset of the start of record rfirst */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_recs"; +#endif + int last; /* byte offset of the last byte of record rlast */ + xfs_bmbt_rec_t *rp; /* address of record 1 */ + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast); + ASSERT(bp); + tp = cur->bc_tp; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + rp = XFS_BMAP_REC_DADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(tp, bp, first, last); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +int /* error */ +xfs_bmbt_lookup_eq( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_fileoff_t off, /* startoff to search for */ + xfs_fsblock_t bno, /* startblock stored in the cursor record */ + xfs_filblks_t len, /* blockcount stored in the cursor record */ + int *stat) /* success/failure */ +{ /* store the search values in cur->bc_rec.b, then look up with XFS_LOOKUP_EQ */ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +int /* error */ +xfs_bmbt_lookup_ge( + xfs_btree_cur_t *cur, /* btree cursor */ + 
xfs_fileoff_t off, /* startoff to search for */ + xfs_fsblock_t bno, /* startblock stored in the cursor record */ + xfs_filblks_t len, /* blockcount stored in the cursor record */ + int *stat) /* success/failure */ +{ /* same as xfs_bmbt_lookup_eq() but with XFS_LOOKUP_GE */ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat); +} + +int /* error */ +xfs_bmbt_lookup_le( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_fileoff_t off, /* startoff to search for */ + xfs_fsblock_t bno, /* startblock stored in the cursor record */ + xfs_filblks_t len, /* blockcount stored in the cursor record */ + int *stat) /* success/failure */ +{ /* same as xfs_bmbt_lookup_eq() but with XFS_LOOKUP_LE */ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_bmbt_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Give the bmap btree a new root block. Copy the old broot contents + * down into a real block and make the broot point to it. + */ +int /* error */ +xfs_bmbt_newroot( + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat) /* return status - 0 fail */ +{ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_block_t *block; /* bmap btree block */ + xfs_buf_t *bp; /* buffer for block */ + xfs_bmbt_block_t *cblock; /* child btree block */ + xfs_bmbt_key_t *ckp; /* child key pointer */ + xfs_bmbt_ptr_t *cpp; /* child ptr pointer */ + int error; /* error return code */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_newroot"; +#endif +#ifdef DEBUG + int i; /* loop counter */ +#endif + xfs_bmbt_key_t *kp; /* pointer to bmap btree key */ + int level; /* btree level */ + xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + level = cur->bc_nlevels - 1; /* current top level, i.e. the root in the inode */ + block = xfs_bmbt_get_block(cur, level, &bp); + /* + * Copy the root into a real block. 
+ */ + args.mp = cur->bc_mp; + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); + args.tp = cur->bc_tp; + args.fsbno = cur->bc_private.b.firstblock; + args.mod = args.minleft = args.alignment = args.total = args.isfl = + args.userdata = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; /* ask for exactly one block */ + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (args.fsbno == NULLFSBLOCK) { +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + args.fsbno = INT_GET(*pp, ARCH_CONVERT); /* no first block recorded: use the root's first child pointer */ + args.type = XFS_ALLOCTYPE_START_BNO; + } else if (args.wasdel) /* NOTE(review): xfs_bmbt_split() tests flist->xbf_low at this point, not wasdel -- confirm the difference is intended */ + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_NEAR_BNO; + if ((error = xfs_alloc_vextent(&args))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; /* no block was allocated: reported through *stat, not as an error */ + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; /* not logged here; XFS_ILOG_CORE is returned through *logflags below */ + if (XFS_IS_QUOTA_ON(args.mp) && + cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_uquotino && + cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_gquotino) + xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0); + cblock = XFS_BUF_TO_BMBT_BLOCK(bp); + *cblock = *block; /* child starts as a copy of the old root header */ + INT_MOD(block->bb_level, ARCH_CONVERT, +1); + INT_SET(block->bb_numrecs, ARCH_CONVERT, 1); /* root now holds a single entry: the new child */ + cur->bc_nlevels++; + cur->bc_ptrs[level + 1] = 1; + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); + bcopy(kp, ckp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*kp)); + cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, 
ERROR); + return error; + } + } +#endif + bcopy(pp, cpp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*pp)); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno, + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + INT_SET(*pp, ARCH_CONVERT, args.fsbno); /* root's first pointer now names the new child block */ + xfs_iroot_realloc(cur->bc_private.b.ip, 1 - INT_GET(cblock->bb_numrecs, ARCH_CONVERT), + cur->bc_private.b.whichfork); /* record-count change for the in-inode root: down to one entry */ + xfs_btree_setbuf(cur, level, bp); + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS); + xfs_bmbt_log_keys(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT)); + xfs_bmbt_log_ptrs(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT)); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *logflags |= + XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); + *stat = 1; + return 0; +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_set_all( + xfs_bmbt_rec_t *r, /* packed record (destination) */ + xfs_bmbt_irec_t *s) /* unpacked record (source) */ +{ + int extent_flag; /* 0 for XFS_EXT_NORM, 1 otherwise */ + + ASSERT((s->br_state == XFS_EXT_NORM) || + (s->br_state == XFS_EXT_UNWRITTEN)); + extent_flag = (s->br_state == XFS_EXT_NORM) ? 
0 : 1; + ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0); /* NOTE(review): permits 55 bits, yet startoff is shifted left by 9 below a flag at bit 63; xfs_bmbt_set_allf() checks 64-BMBT_STARTOFF_BITLEN instead -- confirm */ + ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0); +#if XFS_BIG_FILESYSTEMS + ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0); +#endif /* XFS_BIG_FILESYSTEMS */ +#if XFS_BIG_FILESYSTEMS + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + ((xfs_bmbt_rec_base_t)s->br_startblock >> 43)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); +#else /* !XFS_BIG_FILESYSTEMS */ + if (ISNULLSTARTBLOCK(s->br_startblock)) { /* null startblock: also set the 9 startblock bits in l0 and the top 11 bits of l1 */ + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); + INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } else { /* ordinary startblock: the 9 startblock bits in l0 stay zero */ + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } +#endif /* XFS_BIG_FILESYSTEMS */ +} + +/* + * Set all the fields in a bmap extent record from the arguments. + */ +void +xfs_bmbt_set_allf( + xfs_bmbt_rec_t *r, /* packed record (destination) */ + xfs_fileoff_t o, /* startoff */ + xfs_fsblock_t b, /* startblock */ + xfs_filblks_t c, /* blockcount */ + xfs_exntst_t v) /* extent state */ +{ + int extent_flag; /* 0 for XFS_EXT_NORM, 1 otherwise */ + + ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN)); + extent_flag = (v == XFS_EXT_NORM) ? 
0 : 1; + ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); +#if XFS_BIG_FILESYSTEMS + ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); +#endif /* XFS_BIG_FILESYSTEMS */ +#if XFS_BIG_FILESYSTEMS + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9) | + ((xfs_bmbt_rec_base_t)b >> 43)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); +#else /* !XFS_BIG_FILESYSTEMS */ + if (ISNULLSTARTBLOCK(b)) { /* null startblock: also set the 9 startblock bits in l0 and the top 11 bits of l1 */ + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9) | + (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); + INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } else { /* ordinary startblock: the 9 startblock bits in l0 stay zero */ + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } +#endif /* XFS_BIG_FILESYSTEMS */ +} + +/* + * Set the blockcount field in a bmap extent record. + */ +void +xfs_bmbt_set_blockcount( + xfs_bmbt_rec_t *r, /* packed record to modify */ + xfs_filblks_t v) /* new blockcount; must fit in 21 bits */ +{ + ASSERT((v & XFS_MASK64HI(43)) == 0); + INT_SET(r->l1, ARCH_CONVERT, (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) | + (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21))); /* keep the top 43 bits of l1, replace the low 21 */ +} + +/* + * Set the startblock field in a bmap extent record. 
+ */ +void +xfs_bmbt_set_startblock( + xfs_bmbt_rec_t *r, /* packed record to modify */ + xfs_fsblock_t v) /* new startblock */ +{ +#if XFS_BIG_FILESYSTEMS + ASSERT((v & XFS_MASK64HI(12)) == 0); +#endif /* XFS_BIG_FILESYSTEMS */ +#if XFS_BIG_FILESYSTEMS + INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) | + (xfs_bmbt_rec_base_t)(v >> 43)); /* keep the top 55 bits of l0; the low 9 take v >> 43 */ + INT_SET(r->l1, ARCH_CONVERT, (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) | + (xfs_bmbt_rec_base_t)(v << 21)); /* keep the low 21 bits of l1 (blockcount); the rest takes v << 21 */ +#else /* !XFS_BIG_FILESYSTEMS */ + if (ISNULLSTARTBLOCK(v)) { /* null startblock: set the 9 startblock bits in l0 and the top 11 bits of l1 */ + INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) | (xfs_bmbt_rec_base_t)XFS_MASK64LO(9))); + INT_SET(r->l1, ARCH_CONVERT, (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)v << 21) | + (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } else { /* ordinary startblock: clear the 9 startblock bits in l0 */ + INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9))); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)v << 21) | + (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } +#endif /* XFS_BIG_FILESYSTEMS */ +} + +/* + * Set the startoff field in a bmap extent record. + */ +void +xfs_bmbt_set_startoff( + xfs_bmbt_rec_t *r, /* packed record to modify */ + xfs_fileoff_t v) /* new startoff */ +{ + ASSERT((v & XFS_MASK64HI(9)) == 0); /* NOTE(review): permits 55 bits, but v << 9 with bit 54 set would reach the flag at bit 63; xfs_bmbt_set_allf() checks 64-BMBT_STARTOFF_BITLEN -- confirm */ + INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) | + ((xfs_bmbt_rec_base_t)v << 9) | + (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9))); /* keeps the top bit and the low 9 bits of l0, replaces what lies between */ +} + +/* + * Set the extent state field in a bmap extent record. 
+ */ +void +xfs_bmbt_set_state( + xfs_bmbt_rec_t *r, /* packed record to modify */ + xfs_exntst_t v) /* XFS_EXT_NORM or XFS_EXT_UNWRITTEN (asserted) */ +{ + ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); + if (v == XFS_EXT_NORM) + INT_SET(r->l0, ARCH_CONVERT, INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)); /* clear the flag bit(s) at the top of l0 */ + else + INT_SET(r->l0, ARCH_CONVERT, INT_GET(r->l0, ARCH_CONVERT) | XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN)); /* set the flag bit(s) at the top of l0 */ +} + +/* + * Convert in-memory form of btree root to on-disk form. + */ +void +xfs_bmbt_to_bmdr( + xfs_bmbt_block_t *rblock, /* in-memory root (source) */ + int rblocklen, /* length used to locate rblock's keys and ptrs */ + xfs_bmdr_block_t *dblock, /* on-disk root (destination) */ + int dblocklen) /* length used to locate dblock's keys and ptrs */ +{ + int dmxr; /* max records of dblock, then reused as the actual record count */ + xfs_bmbt_key_t *fkp; /* from: keys in rblock */ + xfs_bmbt_ptr_t *fpp; /* from: ptrs in rblock */ + xfs_bmbt_key_t *tkp; /* to: keys in dblock */ + xfs_bmbt_ptr_t *tpp; /* to: ptrs in dblock */ + + ASSERT(INT_GET(rblock->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC); + ASSERT(INT_GET(rblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO); + ASSERT(INT_GET(rblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO); + ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0); + dblock->bb_level = rblock->bb_level; /* both in on-disk format */ + dblock->bb_numrecs = rblock->bb_numrecs;/* both in on-disk format */ + dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); + fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); + tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); + tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT); /* from here on dmxr is the number of entries to copy */ + bcopy(fkp, tkp, sizeof(*fkp) * dmxr); + bcopy(fpp, tpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ +} + +/* + * Update the record to the passed values. 
+ */ +int +xfs_bmbt_update( + xfs_btree_cur_t *cur, /* btree cursor; bc_ptrs[0] selects the record */ + xfs_fileoff_t off, /* new startoff */ + xfs_fsblock_t bno, /* new startblock */ + xfs_filblks_t len, /* new blockcount */ + xfs_exntst_t state) /* new extent state */ +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; + int error; +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_update"; +#endif + xfs_bmbt_key_t key; + int ptr; + xfs_bmbt_rec_t *rp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno, + (xfs_dfilblks_t)len, (int)state); + block = xfs_bmbt_get_block(cur, 0, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[0]; + rp = XFS_BMAP_REC_IADDR(block, ptr, cur); + xfs_bmbt_set_allf(rp, off, bno, len, state); + xfs_bmbt_log_recs(cur, bp, ptr, ptr); + if (ptr > 1) { /* not the first record of the block: no key update is made */ + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + INT_SET(key.br_startoff, ARCH_CONVERT, off); /* first record changed: push its startoff up as the new key from level 1 */ + if ((error = xfs_bmbt_updkey(cur, &key, 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; +} + +/* + * Check an extent list, which has just been read, for + * any bit in the extent flag field. ASSERT on debug + * kernels, as this condition should not occur. + * Return an error condition (1) if any flags found, + * otherwise return 0. + */ +int +xfs_check_nostate_extents( + xfs_bmbt_rec_t *ep, /* first record of the list */ + xfs_extnum_t num) /* number of records to check */ +{ + for (; num > 0; num--, ep++) { + if (((INT_GET(ep->l0, ARCH_CONVERT)) >> + (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { /* top flag bit(s) of l0 are set */ + ASSERT(0); + return 1; + } + } + return 0; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_bmap_btree.h linux-2.4-xfs/fs/xfs/xfs_bmap_btree.h --- linux-2.4.19/fs/xfs/xfs_bmap_btree.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_bmap_btree.h Fri Jul 12 20:03:25 2002 @@ -0,0 +1,656 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BMAP_BTREE_H__ +#define __XFS_BMAP_BTREE_H__ + +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ + +struct xfs_btree_cur; +struct xfs_btree_lblock; +struct xfs_mount; +struct xfs_inode; + +/* + * Bmap root header, on-disk form only: just a level and a record count (see xfs_bmdr_to_bmbt / xfs_bmbt_to_bmdr). + */ +typedef struct xfs_bmdr_block +{ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * For 32-bit kernels, + * l0:31 is an extent flag (value 1 indicates non-normal). + * l0:0-30 and l1:9-31 are startoff. + * l1:0-8, l2:0-31, and l3:21-31 are startblock. + * l3:0-20 are blockcount. 
+ * For 64-bit kernels, + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ +#define BMBT_EXNTFLAG_BITOFF 0 +#define BMBT_EXNTFLAG_BITLEN 1 /* extent flag width in bits */ +#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN) +#define BMBT_STARTOFF_BITLEN 54 /* startoff width in bits */ +#define BMBT_STARTBLOCK_BITOFF (BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN) +#define BMBT_STARTBLOCK_BITLEN 52 /* startblock width in bits */ +#define BMBT_BLOCKCOUNT_BITOFF \ + (BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN) +#define BMBT_BLOCKCOUNT_BITLEN (BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF) /* 128 - (0 + 1 + 54 + 52) = 21, same as the other branch */ + +#else + +#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ +#define BMBT_EXNTFLAG_BITOFF 63 +#define BMBT_EXNTFLAG_BITLEN 1 /* extent flag width in bits */ +#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN) +#define BMBT_STARTOFF_BITLEN 54 /* startoff width in bits */ +#define BMBT_STARTBLOCK_BITOFF 85 /* 128 - 43 (other 9 is in first word) */ +#define BMBT_STARTBLOCK_BITLEN 52 /* startblock width in bits */ +#define BMBT_BLOCKCOUNT_BITOFF 64 /* Start of second 64 bit container */ +#define BMBT_BLOCKCOUNT_BITLEN 21 /* blockcount width in bits */ + +#endif + + +#define BMBT_USE_64 1 /* NOTE(review): no use is visible near here; presumably selects the 64-bit record layout -- confirm */ + +typedef struct xfs_bmbt_rec_32 +{ + __uint32_t l0, l1, l2, l3; /* the record as four 32-bit words */ +} xfs_bmbt_rec_32_t; +typedef struct xfs_bmbt_rec_64 +{ + __uint64_t l0, l1; /* the record as two 64-bit words */ +} xfs_bmbt_rec_64_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; /* both record types use the two-word layout */ + +/* + * Values and macros for delayed-allocation startblock fields. 
+ */
+/*
+ * Throughout this header each accessor is either an out-of-line function
+ * (when XFS_WANT_FUNCS is set, or XFS_WANT_SPACE together with the
+ * accessor's own XFSSO_ switch) or an inline macro expansion.
+ *
+ * STARTBLOCKMASK is STARTBLOCKMASKBITS one-bits sitting directly above
+ * the low STARTBLOCKVALBITS (17) bits; DSTARTBLOCKMASK is the same with
+ * a fixed 35-bit width and the xfs_dfsbno_t type.
+ */
+#define STARTBLOCKVALBITS 17
+#define STARTBLOCKMASKBITS (15 + XFS_BIG_FILESYSTEMS * 20)
+#define DSTARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASK \
+	(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK \
+	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+/* True when every mask bit of x is set. */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLSTARTBLOCK)
+int isnullstartblock(xfs_fsblock_t x);
+#define ISNULLSTARTBLOCK(x) isnullstartblock(x)
+#else
+#define ISNULLSTARTBLOCK(x) (((x) & STARTBLOCKMASK) == STARTBLOCKMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLDSTARTBLOCK)
+int isnulldstartblock(xfs_dfsbno_t x);
+#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x)
+#else
+#define ISNULLDSTARTBLOCK(x) (((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK)
+#endif
+/*
+ * Build a value with all mask bits set and k in the low
+ * STARTBLOCKVALBITS bits; asserts that k fits in those bits.
+ * The parameter is parenthesized in the ASSERT as well as in the value
+ * expression so that an argument containing a low-precedence operator
+ * (e.g. a | b, or c ? a : b) is compared as a whole.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_NULLSTARTBLOCK)
+xfs_fsblock_t nullstartblock(int k);
+#define NULLSTARTBLOCK(k) nullstartblock(k)
+#else
+#define NULLSTARTBLOCK(k) \
+	((ASSERT((k) < (1 << STARTBLOCKVALBITS))), (STARTBLOCKMASK | (k)))
+#endif
+/* Strip the mask bits, leaving the low-order value. */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_STARTBLOCKVAL)
+xfs_filblks_t startblockval(xfs_fsblock_t x);
+#define STARTBLOCKVAL(x) startblockval(x)
+#else
+#define STARTBLOCKVAL(x) ((xfs_filblks_t)((x) & ~STARTBLOCKMASK))
+#endif
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+	XFS_EXTFMT_NOSTATE = 0,
+	XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+	XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+	XFS_EXT_DMAPI_OFFLINE
+} xfs_exntst_t;
+
+/*
+ * Extent state and extent format macros.
+ * XFS_EXTFMT_INODE picks the format from the superblock version bit
+ * tested by XFS_SB_VERSION_HASEXTFLGBIT on the inode's mount.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTFMT_INODE )
+xfs_exntfmt_t xfs_extfmt_inode(struct xfs_inode *ip);
+#define XFS_EXTFMT_INODE(x) xfs_extfmt_inode(x)
+#else
+#define XFS_EXTFMT_INODE(x) \
+	(XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \
+		XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
+#endif
+#define ISUNWRITTEN(x) ((x) == XFS_EXT_UNWRITTEN)
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+	xfs_fileoff_t br_startoff; /* starting file offset */
+	xfs_fsblock_t br_startblock; /* starting block number */
+	xfs_filblks_t br_blockcount; /* number of blocks */
+	xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key
+{
+	xfs_dfiloff_t br_startoff; /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */
+					/* btree block header type */
+typedef struct xfs_btree_lblock xfs_bmbt_block_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
+xfs_bmbt_block_t *xfs_buf_to_bmbt_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_BMBT_BLOCK(bp) xfs_buf_to_bmbt_block(bp)
+#else
+#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+/*
+ * Block sizes.  RBLOCK_* size the root block: the D form uses the
+ * cursor's forksize, the I form uses the fork's if_broot_bytes.
+ * IBLOCK_SIZE is 1 << bc_blocklog.  BLOCK_DSIZE/BLOCK_ISIZE pick the root
+ * form when lev is the cursor's top level (bc_nlevels - 1) and
+ * IBLOCK_SIZE otherwise.  The lev argument of the RBLOCK_* and
+ * IBLOCK_SIZE macro forms is unused.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
+int xfs_bmap_rblock_dsize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) xfs_bmap_rblock_dsize(lev,cur)
+#else
+#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
+int xfs_bmap_rblock_isize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) xfs_bmap_rblock_isize(lev,cur)
+#else
+#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
+	((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
+		(cur)->bc_private.b.whichfork)->if_broot_bytes)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_IBLOCK_SIZE)
+int xfs_bmap_iblock_size(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_IBLOCK_SIZE(lev,cur) xfs_bmap_iblock_size(lev,cur)
+#else
+#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DSIZE)
+int xfs_bmap_block_dsize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DSIZE(lev,cur) xfs_bmap_block_dsize(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BMAP_RBLOCK_DSIZE(lev,cur) : \
+		XFS_BMAP_IBLOCK_SIZE(lev,cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_ISIZE)
+int xfs_bmap_block_isize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_ISIZE(lev,cur) xfs_bmap_block_isize(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BMAP_RBLOCK_ISIZE(lev,cur) : \
+		XFS_BMAP_IBLOCK_SIZE(lev,cur))
+#endif
+
+/*
+ * Record limits.  At the top level the limit is computed from the root
+ * block size; below it the limit comes from the mount's m_bmap_dmxr /
+ * m_bmap_dmnr tables, indexed 0 for level 0 and 1 for any other level.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
+int xfs_bmap_block_dmaxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) xfs_bmap_block_dmaxrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
+			xfs_bmdr, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
+int xfs_bmap_block_imaxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) xfs_bmap_block_imaxrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
+			xfs_bmbt, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
+int xfs_bmap_block_dminrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) xfs_bmap_block_dminrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
+			xfs_bmdr, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
+int xfs_bmap_block_iminrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) xfs_bmap_block_iminrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
+			xfs_bmbt, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
+#endif
+
+/*
+ * Address of the i'th record, key or pointer in block bb.  Each macro
+ * decodes bb's own bb_level and feeds the matching block size and
+ * maxrecs (D or I flavour) to the generic XFS_BTREE_*_ADDR macro.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_DADDR)
+xfs_bmbt_rec_t *
+xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_REC_DADDR(bb,i,cur) xfs_bmap_rec_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_REC_DADDR(bb,i,cur) \
+	XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_IADDR)
+xfs_bmbt_rec_t *
+xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_REC_IADDR(bb,i,cur) xfs_bmap_rec_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_REC_IADDR(bb,i,cur) \
+	XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_DADDR)
+xfs_bmbt_key_t *
+xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_KEY_DADDR(bb,i,cur) xfs_bmap_key_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_KEY_DADDR(bb,i,cur) \
+	XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_IADDR)
+xfs_bmbt_key_t *
+xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_KEY_IADDR(bb,i,cur) xfs_bmap_key_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_KEY_IADDR(bb,i,cur) \
+	XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_DADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_PTR_DADDR(bb,i,cur) xfs_bmap_ptr_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_PTR_DADDR(bb,i,cur) \
+	XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_IADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_PTR_IADDR(bb,i,cur) xfs_bmap_ptr_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
+	XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+/*
+ * Each accessor below is an out-of-line function when XFS_WANT_FUNCS is
+ * set (or XFS_WANT_SPACE with the accessor's XFSSO_ switch), otherwise an
+ * inline macro.  The *_ADDR forms take the block size sz directly and
+ * derive maxrecs from it with XFS_BMAP_BROOT_MAXRECS.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
+xfs_bmbt_rec_t *xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) xfs_bmap_broot_rec_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
+	XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
+xfs_bmbt_key_t *xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) xfs_bmap_broot_key_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
+	XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
+xfs_bmbt_ptr_t *xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) xfs_bmap_broot_ptr_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
+	XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+
+/* Record count decoded from the block's bb_numrecs field. */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_NUMRECS)
+int xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb);
+#define XFS_BMAP_BROOT_NUMRECS(bb) xfs_bmap_broot_numrecs(bb)
+#else
+#define XFS_BMAP_BROOT_NUMRECS(bb) (INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_MAXRECS)
+int xfs_bmap_broot_maxrecs(int sz);
+#define XFS_BMAP_BROOT_MAXRECS(sz) xfs_bmap_broot_maxrecs(sz)
+#else
+#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+#endif
+/*
+ * Byte sizes for nrecs key/pointer pairs plus a header:
+ * BROOT_SPACE_CALC uses the xfs_bmbt_block_t header, BMDR_SPACE_CALC the
+ * xfs_bmdr_block_t header.  BROOT_SPACE applies the former to the
+ * record count stored in bb.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
+int xfs_bmap_broot_space_calc(int nrecs);
+#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) xfs_bmap_broot_space_calc(nrecs)
+#else
+#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
+	((int)(sizeof(xfs_bmbt_block_t) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE)
+int xfs_bmap_broot_space(xfs_bmdr_block_t *bb);
+#define XFS_BMAP_BROOT_SPACE(bb) xfs_bmap_broot_space(bb)
+#else
+#define XFS_BMAP_BROOT_SPACE(bb) \
+	XFS_BMAP_BROOT_SPACE_CALC(INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMDR_SPACE_CALC)
+int xfs_bmdr_space_calc(int nrecs);
+#define XFS_BMDR_SPACE_CALC(nrecs) xfs_bmdr_space_calc(nrecs)
+#else
+#define XFS_BMDR_SPACE_CALC(nrecs) \
+	((int)(sizeof(xfs_bmdr_block_t) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
+#endif
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BM_MAXLEVELS)
+int xfs_bm_maxlevels(struct xfs_mount *mp, int w);
+#define XFS_BM_MAXLEVELS(mp,w) xfs_bm_maxlevels(mp,w)
+#else
+#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[w])
+#endif
+
+/*
+ * Header sanity test for a bmap btree block: the magic must be
+ * XFS_BMAP_MAGIC, the stored level must equal the expected level, and
+ * the record count must be in 1..m_bmap_dmxr[level != 0].
+ * Every use of the level parameter is parenthesized so that an
+ * expression argument is compared as a whole.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_SANITY_CHECK)
+int xfs_bmap_sanity_check(struct xfs_mount *mp, xfs_bmbt_block_t *bb,
+	int level);
+#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
+	xfs_bmap_sanity_check(mp,bb,level)
+#else
+#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
+	(INT_GET((bb)->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC && \
+	 INT_GET((bb)->bb_level, ARCH_CONVERT) == (level) && \
+	 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) > 0 && \
+	 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0])
+#endif
+
+/*
+ * Trace buffer entry types.
+ */
+#define XFS_BMBT_KTRACE_ARGBI 1
+#define XFS_BMBT_KTRACE_ARGBII 2
+#define XFS_BMBT_KTRACE_ARGFFFI 3
+#define XFS_BMBT_KTRACE_ARGI 4
+#define XFS_BMBT_KTRACE_ARGIFK 5
+#define XFS_BMBT_KTRACE_ARGIFR 6
+#define XFS_BMBT_KTRACE_ARGIK 7
+#define XFS_BMBT_KTRACE_CUR 8
+
+#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
+#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
+
+/*
+ * XFS_BMBT_TRACE is turned on by XFS_ALL_TRACE and is always turned off
+ * again when DEBUG is not defined.
+ */
+#if defined(XFS_ALL_TRACE)
+#define XFS_BMBT_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_BMBT_TRACE
+#endif
+
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+
+void
+xfs_bmdr_to_bmbt(
+	xfs_bmdr_block_t *,
+	int,
+	xfs_bmbt_block_t *,
+	int);
+
+int
+xfs_bmbt_decrement(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+int
+xfs_bmbt_delete(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+/*
+ * Record accessors: xfs_bmbt_get_all fills an incore xfs_bmbt_irec_t
+ * from a record; the single-field getters below return one field each,
+ * typed as the corresponding xfs_bmbt_irec_t member.
+ */
+void
+xfs_bmbt_get_all(
+	xfs_bmbt_rec_t *r,
+	xfs_bmbt_irec_t *s);
+
+xfs_bmbt_block_t *
+xfs_bmbt_get_block(
+	struct xfs_btree_cur *cur,
+	int level,
+	struct xfs_buf **bpp);
+
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+	xfs_bmbt_rec_t *r);
+
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+	xfs_bmbt_rec_t *r);
+
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+	xfs_bmbt_rec_t *r);
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+	xfs_bmbt_rec_t *r);
+
+int
+xfs_bmbt_increment(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+int
+xfs_bmbt_insert(
+	struct xfs_btree_cur *,
+	int *);
+
+int
+xfs_bmbt_insert_many(
+	struct xfs_btree_cur *,
+	int,
+	xfs_bmbt_rec_t *,
+	int *);
+
+void
+xfs_bmbt_log_block(
+	struct xfs_btree_cur *,
+	struct xfs_buf *,
+	int);
+
+void
+xfs_bmbt_log_recs(
+	struct xfs_btree_cur *,
+	struct xfs_buf *,
+	int,
+	int);
+
+/*
+ * The three lookup variants share one signature: a cursor, a file
+ * offset, a filesystem block, a block count and an int out-parameter.
+ * NOTE(review): the parameters are unnamed here; presumably the int
+ * receives a found/not-found status -- confirm against xfs_bmap_btree.c.
+ */
+int
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+int
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+int
+xfs_bmbt_lookup_le(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+/*
+ * Give the bmap btree a new root block.  Copy the old broot contents
+ * down into a real block and make the broot point to it.
+ */
+int /* error */
+xfs_bmbt_newroot(
+	struct xfs_btree_cur *cur, /* btree cursor */
+	int *logflags, /* logging flags for inode */
+	int *stat); /* return status - 0 fail */
+
+/*
+ * Record setters: xfs_bmbt_set_all takes an incore xfs_bmbt_irec_t,
+ * xfs_bmbt_set_allf takes the four field values individually, and the
+ * remaining setters take one field value each.
+ */
+void
+xfs_bmbt_set_all(
+	xfs_bmbt_rec_t *r,
+	xfs_bmbt_irec_t *s);
+
+void
+xfs_bmbt_set_allf(
+	xfs_bmbt_rec_t *r,
+	xfs_fileoff_t o,
+	xfs_fsblock_t b,
+	xfs_filblks_t c,
+	xfs_exntst_t v);
+
+void
+xfs_bmbt_set_blockcount(
+	xfs_bmbt_rec_t *r,
+	xfs_filblks_t v);
+
+void
+xfs_bmbt_set_startblock(
+	xfs_bmbt_rec_t *r,
+	xfs_fsblock_t v);
+
+void
+xfs_bmbt_set_startoff(
+	xfs_bmbt_rec_t *r,
+	xfs_fileoff_t v);
+
+void
+xfs_bmbt_set_state(
+	xfs_bmbt_rec_t *r,
+	xfs_exntst_t v);
+
+void
+xfs_bmbt_to_bmdr(
+	xfs_bmbt_block_t *,
+	int,
+	xfs_bmdr_block_t *,
+	int);
+
+int
+xfs_bmbt_update(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	xfs_exntst_t);
+
+#ifdef XFSDEBUG
+/*
+ * Get the data from the pointed-to record.
+ * Only declared when XFSDEBUG is defined.
+ */
+int
+xfs_bmbt_get_rec(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t *,
+	xfs_fsblock_t *,
+	xfs_filblks_t *,
+	xfs_exntst_t *,
+	int *);
+#endif
+
+
+/*
+ * Search an extent list for the extent which includes block
+ * bno.
+ */
+xfs_bmbt_rec_t *
+xfs_bmap_do_search_extents(
+	xfs_bmbt_rec_t *,
+	xfs_extnum_t,
+	xfs_extnum_t,
+	xfs_fileoff_t,
+	int *,
+	xfs_extnum_t *,
+	xfs_bmbt_irec_t *,
+	xfs_bmbt_irec_t *);
+
+
+#endif /* __XFS_BMAP_BTREE_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_btree.c linux-2.4-xfs/fs/xfs/xfs_btree.c
--- linux-2.4.19/fs/xfs/xfs_btree.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_btree.c Thu Aug 8 20:03:32 2002
@@ -0,0 +1,937 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains common code for the space manager's btree implementations.
+ */
+
+#include <xfs.h>
+
+/*
+ * Cursor allocation zone.
+ * xfs_btree_init_cursor() allocates cursors from it and
+ * xfs_btree_del_cursor() frees them back to it.
+ */
+kmem_zone_t *xfs_btree_cur_zone;
+
+/*
+ * Btree magic numbers.
+ * Indexed by btree type: the block header checks compare a block's magic
+ * against xfs_magics[cur->bc_btnum].
+ */
+const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
+{
+	XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+};
+
+/*
+ * Prototypes for internal routines.
+ */
+
+/*
+ * Checking routine: return maxrecs for the block.
+ */
+STATIC int /* number of records fitting in block */
+xfs_btree_maxrecs(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	xfs_btree_block_t *block);/* generic btree block pointer */
+
+/*
+ * Internal routines.
+ */
+
+/*
+ * Checking routine: report the record capacity of a block.
+ *
+ * The cursor's btree type selects the per-btree MAXRECS macro, which is
+ * given the level decoded from the block header.  An unrecognised btree
+ * type trips ASSERT(0) and reports 0.
+ */
+STATIC int /* number of records fitting in block */
+xfs_btree_maxrecs(
+	xfs_btree_cur_t *cur, /* cursor naming the btree type */
+	xfs_btree_block_t *block) /* block whose level is consulted */
+{
+	int nrecs; /* capacity to report */
+
+	switch (cur->bc_btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		/* both allocation btrees share one limit macro */
+		nrecs = (int)XFS_ALLOC_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+		break;
+	case XFS_BTNUM_BMAP:
+		nrecs = (int)XFS_BMAP_BLOCK_IMAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+		break;
+	case XFS_BTNUM_INO:
+		nrecs = (int)XFS_INOBT_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+		break;
+	default:
+		ASSERT(0);
+		nrecs = 0;
+		break;
+	}
+	return nrecs;
+}
+
+/*
+ * External routines.
+ */
+
+#ifdef DEBUG
+/*
+ * Debug routine: validate a block header.
+ * Dispatches on the pointer width of the cursor's btree type to the
+ * long-form or short-form checker; the checker's result is discarded.
+ */
+void
+xfs_btree_check_block(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	xfs_btree_block_t *block, /* generic btree block pointer */
+	int level, /* level of the btree block */
+	xfs_buf_t *bp) /* buffer containing block, if any */
+{
+	if (!XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+		xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
+			bp);
+		return;
+	}
+	xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
+		bp);
+}
+
+/*
+ * Debug routine: check that keys are in the right order.
+ */
+void
+xfs_btree_check_key(
+	xfs_btnum_t btnum, /* btree identifier */
+	void *ak1, /* pointer to left (lower) key */
+	void *ak2) /* pointer to right (higher) key */
+{
+	/*
+	 * Each btree type has its own key layout: view the two opaque
+	 * pointers through the matching key type and assert that the
+	 * left key sorts strictly before the right one.
+	 */
+	if (btnum == XFS_BTNUM_BNO) {
+		xfs_alloc_key_t *k1 = ak1;
+		xfs_alloc_key_t *k2 = ak2;
+
+		/* ordered by start block */
+		ASSERT(INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT));
+	} else if (btnum == XFS_BTNUM_CNT) {
+		xfs_alloc_key_t *k1 = ak1;
+		xfs_alloc_key_t *k2 = ak2;
+
+		/* ordered by block count, ties broken by start block */
+		ASSERT(INT_GET(k1->ar_blockcount, ARCH_CONVERT) < INT_GET(k2->ar_blockcount, ARCH_CONVERT) ||
+		       (INT_GET(k1->ar_blockcount, ARCH_CONVERT) == INT_GET(k2->ar_blockcount, ARCH_CONVERT) &&
+			INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT)));
+	} else if (btnum == XFS_BTNUM_BMAP) {
+		xfs_bmbt_key_t *k1 = ak1;
+		xfs_bmbt_key_t *k2 = ak2;
+
+		/* ordered by starting file offset */
+		ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT));
+	} else if (btnum == XFS_BTNUM_INO) {
+		xfs_inobt_key_t *k1 = ak1;
+		xfs_inobt_key_t *k2 = ak2;
+
+		/* ordered by starting inode number */
+		ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT));
+	} else {
+		/* unknown btree type */
+		ASSERT(0);
+	}
+}
+#endif /* DEBUG */
+
+/*
+ * Checking routine: check that long form block header is ok.
+ */
+/*
+ * The block passes when its magic matches the cursor's btree type, its
+ * level equals the expected level, its record count does not exceed
+ * xfs_btree_maxrecs(), and each sibling pointer is non-zero and either
+ * NULLDFSBNO or accepted by XFS_FSB_SANITY_CHECK.  The verdict is then
+ * passed through XFS_TEST_ERROR; on failure the buffer (if any) is
+ * traced and EFSCORRUPTED is returned.
+ */
+/* ARGSUSED */
+int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	xfs_btree_lblock_t *block, /* btree long form block pointer */
+	int level, /* level of the btree block */
+	xfs_buf_t *bp) /* buffer for block, if any */
+{
+	int lblock_ok; /* block passes checks */
+	xfs_mount_t *mp = cur->bc_mp; /* file system mount point */
+
+	/* Header fields: magic, level, record count. */
+	lblock_ok =
+		INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
+		INT_GET(block->bb_level, ARCH_CONVERT) == level &&
+		INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block);
+	/* Left sibling, examined only if the header fields passed. */
+	if (lblock_ok)
+		lblock_ok =
+			!INT_ISZERO(block->bb_leftsib, ARCH_CONVERT) &&
+			(INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO ||
+			 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_leftsib, ARCH_CONVERT)));
+	/* Right sibling, examined only if everything so far passed. */
+	if (lblock_ok)
+		lblock_ok =
+			!INT_ISZERO(block->bb_rightsib, ARCH_CONVERT) &&
+			(INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO ||
+			 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_rightsib, ARCH_CONVERT)));
+	if (!XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+			XFS_RANDOM_BTREE_CHECK_LBLOCK))
+		return 0;
+	if (bp)
+		xfs_buftrace("LBTREE ERROR", bp);
+#ifdef __KERNEL__ /* additional, temporary, debugging code */
+	cmn_err(CE_NOTE,
+		"EFSCORRUPTED returned from file %s line %d",
+		__FILE__, __LINE__);
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+/*
+ * Checking routine: validate a long-form btree pointer.
+ * The pointer is acceptable only at a level above 0, when it is not
+ * NULLDFSBNO and when XFS_FSB_SANITY_CHECK accepts it; anything else is
+ * handled by XFS_WANT_CORRUPTED_RETURN.
+ */
+int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	xfs_dfsbno_t ptr, /* btree block disk address */
+	int level) /* btree block level */
+{
+	xfs_mount_t *mp = cur->bc_mp; /* file system mount point */
+
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		ptr != NULLDFSBNO &&
+		XFS_FSB_SANITY_CHECK(mp, ptr));
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Debug routine: check that records are in the right order.
+ */
+void
+xfs_btree_check_rec(
+	xfs_btnum_t btnum, /* btree identifier */
+	void *ar1, /* pointer to left (lower) record */
+	void *ar2) /* pointer to right (higher) record */
+{
+	/*
+	 * View the two opaque pointers through the record type of the
+	 * given btree and assert that the left record ends at or before
+	 * the start of the right one (or, for the by-count btree, that
+	 * it sorts strictly first).
+	 */
+	if (btnum == XFS_BTNUM_BNO) {
+		xfs_alloc_rec_t *r1 = ar1;
+		xfs_alloc_rec_t *r2 = ar2;
+
+		/* left extent must not run past the right extent's start */
+		ASSERT(INT_GET(r1->ar_startblock, ARCH_CONVERT) + INT_GET(r1->ar_blockcount, ARCH_CONVERT) <=
+		       INT_GET(r2->ar_startblock, ARCH_CONVERT));
+	} else if (btnum == XFS_BTNUM_CNT) {
+		xfs_alloc_rec_t *r1 = ar1;
+		xfs_alloc_rec_t *r2 = ar2;
+
+		/* ordered by block count, ties broken by start block */
+		ASSERT(INT_GET(r1->ar_blockcount, ARCH_CONVERT) < INT_GET(r2->ar_blockcount, ARCH_CONVERT) ||
+		       (INT_GET(r1->ar_blockcount, ARCH_CONVERT) == INT_GET(r2->ar_blockcount, ARCH_CONVERT) &&
+			INT_GET(r1->ar_startblock, ARCH_CONVERT) < INT_GET(r2->ar_startblock, ARCH_CONVERT)));
+	} else if (btnum == XFS_BTNUM_BMAP) {
+		xfs_bmbt_rec_t *r1 = ar1;
+		xfs_bmbt_rec_t *r2 = ar2;
+
+		/* left extent's file range must end before the right begins */
+		ASSERT(xfs_bmbt_get_startoff(r1) +
+		       xfs_bmbt_get_blockcount(r1) <=
+		       xfs_bmbt_get_startoff(r2));
+	} else if (btnum == XFS_BTNUM_INO) {
+		xfs_inobt_rec_t *r1 = ar1;
+		xfs_inobt_rec_t *r2 = ar2;
+
+		/* inode chunks of XFS_INODES_PER_CHUNK must not overlap */
+		ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <=
+		       INT_GET(r2->ir_startino, ARCH_CONVERT));
+	} else {
+		/* unknown btree type */
+		ASSERT(0);
+	}
+}
+#endif /* DEBUG */
+
+/*
+ * Checking routine: validate a short-form block header.
+ *
+ * The block passes when its magic matches the cursor's btree type, its
+ * level equals the expected level, its record count does not exceed
+ * xfs_btree_maxrecs(), and each sibling pointer is non-zero and either
+ * NULLAGBLOCK or below the length read from the cursor's AG header
+ * buffer.  The verdict is passed through XFS_TEST_ERROR; on failure the
+ * buffer (if any) is traced, the header fields are logged and
+ * EFSCORRUPTED is returned.
+ */
+/* ARGSUSED */
+int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	xfs_btree_sblock_t *block, /* btree short form block pointer */
+	int level, /* level of the btree block */
+	xfs_buf_t *bp) /* buffer containing block */
+{
+	xfs_agf_t *agf; /* ag. freespace structure */
+	xfs_agblock_t agflen; /* native ag. freespace length */
+	int sblock_ok; /* block passes checks */
+
+	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	agflen = INT_GET(agf->agf_length, ARCH_CONVERT);
+	/* Header fields: magic, level, record count. */
+	sblock_ok =
+		INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
+		INT_GET(block->bb_level, ARCH_CONVERT) == level &&
+		INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block);
+	/* Left sibling, examined only if the header fields passed. */
+	if (sblock_ok)
+		sblock_ok =
+			(INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK ||
+			 INT_GET(block->bb_leftsib, ARCH_CONVERT) < agflen) &&
+			!INT_ISZERO(block->bb_leftsib, ARCH_CONVERT);
+	/* Right sibling, examined only if everything so far passed. */
+	if (sblock_ok)
+		sblock_ok =
+			(INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK ||
+			 INT_GET(block->bb_rightsib, ARCH_CONVERT) < agflen) &&
+			!INT_ISZERO(block->bb_rightsib, ARCH_CONVERT);
+	if (!XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
+			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+			XFS_RANDOM_BTREE_CHECK_SBLOCK))
+		return 0;
+	if (bp)
+		xfs_buftrace("SBTREE ERROR", bp);
+#ifdef __KERNEL__ /* additional, temporary, debugging code */
+	cmn_err(CE_NOTE,
+		"xfs_btree_check_sblock: Not OK:");
+	cmn_err(CE_NOTE,
+		"magic 0x%x level %d numrecs %d leftsib %d rightsib %d",
+		INT_GET(block->bb_magic, ARCH_CONVERT),
+		INT_GET(block->bb_level, ARCH_CONVERT),
+		INT_GET(block->bb_numrecs, ARCH_CONVERT),
+		INT_GET(block->bb_leftsib, ARCH_CONVERT),
+		INT_GET(block->bb_rightsib, ARCH_CONVERT));
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+/*
+ * Checking routine: validate a short-form btree pointer.
+ * The pointer is acceptable only at a level above 0, when it is neither
+ * NULLAGBLOCK nor 0 and lies below the length read from the cursor's AG
+ * header buffer; anything else is handled by XFS_WANT_CORRUPTED_RETURN.
+ */
+int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	xfs_agblock_t ptr, /* btree block disk address */
+	int level) /* btree block level */
+{
+	xfs_agf_t *agf; /* ag. freespace structure */
+
+	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		ptr != NULLAGBLOCK && ptr != 0 &&
+		ptr < INT_GET(agf->agf_length, ARCH_CONVERT));
+	return 0;
+}
+
+/*
+ * Tear down a btree cursor: drop its buffers, then free it to the zone.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	int error) /* del because of error */
+{
+	int i; /* btree level */
+
+	/*
+	 * Detach every buffer the cursor holds.  In the normal case the
+	 * walk stops at the first level without a buffer.  After an
+	 * error every level is visited instead, because code that fills
+	 * levels from n down to 0 may have failed part-way and left
+	 * gaps below levels that do hold buffers.
+	 */
+	for (i = 0; i < cur->bc_nlevels; i++) {
+		if (cur->bc_bufs[i]) {
+			xfs_btree_setbuf(cur, i, NULL);
+			continue;
+		}
+		if (!error)
+			break;
+	}
+	/*
+	 * A bmap cursor must not be freed while it still carries
+	 * unaccounted allocated indirect blocks.
+	 */
+	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+	       cur->bc_private.b.allocated == 0);
+	/* Hand the cursor back to its zone. */
+	kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * A new cursor is initialised from the old one's arguments, the current
+ * record and per-level positions are copied, and every buffer the old
+ * cursor holds is read again for the new one.  If a read fails the new
+ * cursor is deleted, *ncur is set to NULL and the error is returned.
+ */
+int /* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t *cur, /* input cursor */
+	xfs_btree_cur_t **ncur) /* output cursor */
+{
+	xfs_buf_t *bp; /* btree block's buffer pointer */
+	int error; /* error return value */
+	int i; /* level number of btree block */
+	xfs_mount_t *mp = cur->bc_mp; /* mount structure for filesystem */
+	xfs_btree_cur_t *new; /* new cursor value */
+	xfs_trans_t *tp = cur->bc_tp; /* transaction pointer, can be NULL */
+
+	/* Start from a cursor initialised like the original. */
+	new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
+		cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
+		cur->bc_private.b.whichfork);
+	/* Carry over the record the original currently holds. */
+	new->bc_rec = cur->bc_rec;
+	/* Level by level: copy the position and re-read any buffer. */
+	for (i = 0; i < new->bc_nlevels; i++) {
+		new->bc_ptrs[i] = cur->bc_ptrs[i];
+		new->bc_ra[i] = cur->bc_ra[i];
+		bp = cur->bc_bufs[i];
+		if (!bp) {
+			new->bc_bufs[i] = NULL;
+			continue;
+		}
+		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+			XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp);
+		if (error) {
+			xfs_btree_del_cursor(new, error);
+			*ncur = NULL;
+			return error;
+		}
+		new->bc_bufs[i] = bp;
+		ASSERT(bp);
+		ASSERT(!XFS_BUF_GETERROR(bp));
+	}
+	/*
+	 * Cursor initialisation does not take firstblock, flist or flags
+	 * from its arguments, so bmap cursors copy them here.
+	 */
+	if (new->bc_btnum == XFS_BTNUM_BMAP) {
+		new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+		new->bc_private.b.flist = cur->bc_private.b.flist;
+		new->bc_private.b.flags = cur->bc_private.b.flags;
+	}
+	*ncur = new;
+	return 0;
+}
+
+/*
+ * Point the cursor at the first record of the block at the given level.
+ * Other levels are left alone.  Fails (returns 0) when the block holds
+ * no records.
+ */
+int /* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	int level) /* level to change */
+{
+	xfs_btree_block_t *block; /* generic btree block pointer */
+	xfs_buf_t *bp; /* buffer containing block */
+
+	/* Fetch and check the block at this level. */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	if (!INT_ISZERO(block->bb_h.bb_numrecs, ARCH_CONVERT)) {
+		/* Entries are numbered from 1, so 1 is the first one. */
+		cur->bc_ptrs[level] = 1;
+		return 1;
+	}
+	/* Empty block: there is no first record. */
+	return 0;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be a bmap btree root or from a buffer.
+ */
+/*
+ * For the top level of a bmap btree the block is the inode fork's
+ * if_broot and no buffer is reported (*bpp is set to NULL); at every
+ * other level, and for every other btree type, the block is the data of
+ * the buffer the cursor holds for that level.
+ */
+xfs_btree_block_t * /* generic btree block pointer */
+xfs_btree_get_block(
+	xfs_btree_cur_t *cur, /* btree cursor */
+	int level, /* level in btree */
+	xfs_buf_t **bpp) /* buffer containing the block */
+{
+	xfs_btree_block_t *block; /* return value */
+	xfs_buf_t *bp; /* return buffer */
+	xfs_ifork_t *ifp; /* inode fork pointer */
+
+	if (cur->bc_btnum != XFS_BTNUM_BMAP || level != cur->bc_nlevels - 1) {
+		/* The block lives in the cursor's buffer for this level. */
+		bp = cur->bc_bufs[level];
+		block = XFS_BUF_TO_BLOCK(bp);
+	} else {
+		/* Bmap root: the block lives in the inode fork. */
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+			cur->bc_private.b.whichfork);
+		block = (xfs_btree_block_t *)ifp->if_broot;
+		bp = NULL;
+	}
+	ASSERT(block != NULL);
+	*bpp = bp;
+	return block;
+}
+
+/*
+ * Get a buffer for a block addressed by filesystem block number, without
+ * reading its data (long-form addressing).
+ * The block number is converted to a disk address and the buffer is
+ * obtained through the transaction on the mount's data device target.
+ */
+xfs_buf_t * /* buffer for fsbno */
+xfs_btree_get_bufl(
+	xfs_mount_t *mp, /* file system mount point */
+	xfs_trans_t *tp, /* transaction pointer */
+	xfs_fsblock_t fsbno, /* file system block number */
+	uint lock) /* lock flags for get_buf */
+{
+	xfs_buf_t *bp; /* buffer pointer (return value) */
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+		XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, lock);
+	ASSERT(bp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+	return bp;
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */ +xfs_buf_t * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_buf_t *bp; /* buffer pointer (return value) */ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + return bp; +} + +/* + * Allocate a new btree cursor. + * The cursor is either for allocation (A) or bmap (B) or inodes (I). + */ +xfs_btree_cur_t * /* new btree cursor */ +xfs_btree_init_cursor( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* (A only) buffer for agf structure */ + /* (I only) buffer for agi structure */ + xfs_agnumber_t agno, /* (AI only) allocation group number */ + xfs_btnum_t btnum, /* btree identifier */ + xfs_inode_t *ip, /* (B only) inode owning the btree */ + int whichfork) /* (B only) data or attr fork */ +{ + xfs_agf_t *agf; /* (A) allocation group freespace */ + xfs_agi_t *agi; /* (I) allocation group inodespace */ + xfs_btree_cur_t *cur; /* return value */ + xfs_ifork_t *ifp; /* (I) inode fork pointer */ + int nlevels=0; /* number of levels in the btree */ + + ASSERT(xfs_btree_cur_zone != NULL); + /* + * Allocate a new cursor. + */ + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + /* + * Deduce the number of btree levels from the arguments. 
+ */ + switch (btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + agf = XFS_BUF_TO_AGF(agbp); + nlevels = INT_GET(agf->agf_levels[btnum], ARCH_CONVERT); /* agf_levels[] is indexed by btnum */ + break; + case XFS_BTNUM_BMAP: + ifp = XFS_IFORK_PTR(ip, whichfork); + nlevels = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1; /* root's bb_level counts from 0 (a leaf), so add one */ + break; + case XFS_BTNUM_INO: + agi = XFS_BUF_TO_AGI(agbp); + nlevels = INT_GET(agi->agi_level, ARCH_CONVERT); + break; + default: + ASSERT(0); /* unknown btree type: nlevels keeps its initial 0 */ + } + /* + * Fill in the common fields. + */ + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = nlevels; + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + /* + * Fill in the per-btree-type private fields (the bc_private union). + */ + switch (btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + /* + * Allocation btree fields. + */ + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + break; + case XFS_BTNUM_BMAP: + /* + * Bmap btree fields. + */ + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + break; + case XFS_BTNUM_INO: + /* + * Inode allocation btree fields. + */ + cur->bc_private.i.agbp = agbp; + cur->bc_private.i.agno = agno; + break; + default: + ASSERT(0); /* unknown btree type: no private fields are set */ + } + return cur; +} + +/* + * Check for the cursor referring to the last block at the given level.
+ */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to check */ +{ + xfs_btree_block_t *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); /* expands to nothing unless DEBUG */ + if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) /* bmap btree: long form sibling pointers */ + return INT_GET(block->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO; + else + return INT_GET(block->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK; +} + +/* + * Change the cursor to point to the last record in the current block + * at the given level. Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + xfs_btree_block_t *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); /* expands to nothing unless DEBUG */ + /* + * It's empty, there is no such record. + */ + if (INT_ISZERO(block->bb_h.bb_numrecs, ARCH_CONVERT)) + return 0; + /* + * Set the ptr value to numrecs, that's the last record/key (entries are numbered from 1). + */ + cur->bc_ptrs[level] = INT_GET(block->bb_h.bb_numrecs, ARCH_CONVERT); + return 1; +} + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields, must be nonzero */ + const short *offsets, /* table of field offsets, nbits + 1 entries */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last) /* output: last byte offset */ +{ + int i; /* current bit number */ + __int64_t imask; /* mask for current bit number */ + + ASSERT(fields != 0); /* both scans below stop only when they hit a set bit */ + /* + * Find the lowest bit, so the first byte offset.
+ */ + for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + if (imask & fields) { + *first = offsets[i]; /* start of the lowest selected field */ + break; + } + } + /* + * Find the highest bit at or below nbits - 1, so the last byte offset. + */ + for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + if (imask & fields) { + *last = offsets[i + 1] - 1; /* one byte before the next table entry */ + break; + } + } +} + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + xfs_buf_t **bpp, /* out: buffer for fsbno, may be NULL */ + int refval) /* ref count value for buffer */ +{ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* real disk block address */ + int error; /* result of xfs_trans_read_buf */ + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp))) { + return error; /* *bpp is not written on failure */ + } + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (bp != NULL) { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); /* expands to nothing with the xfs_buf.h in this patch */ + } + *bpp = bp; + return 0; +} + +/* + * Get a buffer for the block, return it read in. + * Short-form addressing.
+ */ +int /* error */ +xfs_btree_read_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock, /* lock flags for read_buf */ + xfs_buf_t **bpp, /* out: buffer for agno/agbno, may be NULL */ + int refval) /* ref count value for buffer */ +{ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* real disk block address */ + int error; /* result of xfs_trans_read_buf */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp))) { + return error; /* *bpp is not written on failure */ + } + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (bp != NULL) { + switch (refval) { /* buffer type follows the ref value; any other refval leaves the buffer untagged */ + case XFS_ALLOC_BTREE_REF: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + break; + case XFS_INO_BTREE_REF: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval); + break; + } + } + *bpp = bp; + return 0; +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); /* length covers count blocks; nothing is handed back */ +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing.
+ */ +/* ARGSUSED */ +void +xfs_btree_reada_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); /* length covers count blocks; nothing is handed back */ +} + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + */ +int /* readahead block count */ +xfs_btree_readahead_core( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + xfs_alloc_block_t *a; /* block at this level, BNO/CNT case */ + xfs_bmbt_block_t *b; /* block at this level, BMAP case */ + xfs_inobt_block_t *i; /* block at this level, INO case */ + int rval = 0; /* number of readaheads issued */ + + ASSERT(cur->bc_bufs[lev] != NULL); + cur->bc_ra[lev] |= lr; /* remembered so xfs_btree_readahead() can skip repeat requests */ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); + if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(a->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + INT_GET(a->bb_leftsib, ARCH_CONVERT), 1); + rval++; + } + if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(a->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + INT_GET(a->bb_rightsib, ARCH_CONVERT), 1); + rval++; + } + break; + case XFS_BTNUM_BMAP: + b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]); + if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(b->bb_leftsib, ARCH_CONVERT) != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_leftsib, ARCH_CONVERT), 1); + rval++; + } + if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(b->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_rightsib, ARCH_CONVERT), 1); + rval++; + } + break; + case XFS_BTNUM_INO: + i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); + if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(i->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + INT_GET(i->bb_leftsib, ARCH_CONVERT), 1); + rval++; + } + if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(i->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + INT_GET(i->bb_rightsib, ARCH_CONVERT), 1); + rval++; + } + break; + default: + ASSERT(0); /* unknown btree type: nothing is read ahead */ + } + return rval; +} + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. + */ +void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + xfs_buf_t *bp) /* new buffer to set, may be NULL */ +{ + xfs_btree_block_t *b; /* btree block */ + xfs_buf_t *obp; /* old buffer pointer */ + + obp = cur->bc_bufs[lev]; + if (obp) + xfs_trans_brelse(cur->bc_tp, obp); + cur->bc_bufs[lev] = bp; + cur->bc_ra[lev] = 0; /* readahead state of the old buffer no longer applies */ + if (!bp) + return; + b = XFS_BUF_TO_BLOCK(bp); + if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) { + if (INT_GET(b->bb_u.l.bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; /* no left sibling: pre-set the bit so xfs_btree_readahead() skips it */ + if (INT_GET(b->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; /* no right sibling: likewise */ + } else { + if (INT_GET(b->bb_u.s.bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; /* no left sibling: pre-set the bit so xfs_btree_readahead() skips it */ + if (INT_GET(b->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; /* no right sibling: likewise */ + } +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_btree.h linux-2.4-xfs/fs/xfs/xfs_btree.h --- linux-2.4.19/fs/xfs/xfs_btree.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_btree.h Wed Jul 10 23:13:58 2002 @@ -0,0 +1,587 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BTREE_H__ +#define __XFS_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * This nonsense is to make -wlint happy. + */ +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) + +/* + * Short form header: space allocation btrees. 
+ */ +typedef struct xfs_btree_sblock +{ + __uint32_t bb_magic; /* magic number for block type */ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ + xfs_agblock_t bb_leftsib; /* left sibling block or NULLAGBLOCK */ + xfs_agblock_t bb_rightsib; /* right sibling block or NULLAGBLOCK */ +} xfs_btree_sblock_t; + +/* + * Long form header: bmap btrees. + */ +typedef struct xfs_btree_lblock +{ + __uint32_t bb_magic; /* magic number for block type */ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ + xfs_dfsbno_t bb_leftsib; /* left sibling block or NULLDFSBNO */ + xfs_dfsbno_t bb_rightsib; /* right sibling block or NULLDFSBNO */ +} xfs_btree_lblock_t; + +/* + * Common header (the three fields both forms start with) and a combined block structure, used by common code. + */ +typedef struct xfs_btree_hdr +{ + __uint32_t bb_magic; /* magic number for block type */ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ +} xfs_btree_hdr_t; + +typedef struct xfs_btree_block +{ + xfs_btree_hdr_t bb_h; /* header */ + union { + struct { + xfs_agblock_t bb_leftsib; /* left sibling block or NULLAGBLOCK */ + xfs_agblock_t bb_rightsib; /* right sibling block or NULLAGBLOCK */ + } s; /* short form pointers */ + struct { + xfs_dfsbno_t bb_leftsib; /* left sibling block or NULLDFSBNO */ + xfs_dfsbno_t bb_rightsib; /* right sibling block or NULLDFSBNO */ + } l; /* long form pointers */ + } bb_u; /* sibling pointers; XFS_BTREE_LONG_PTRS() says which form applies */ +} xfs_btree_block_t; + +/* + * Bit values naming the block header fields, for logging record fields. + */ +#define XFS_BB_MAGIC 0x01 +#define XFS_BB_LEVEL 0x02 +#define XFS_BB_NUMRECS 0x04 +#define XFS_BB_LEFTSIB 0x08 +#define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_NUM_BITS 5 /* count of the XFS_BB_* field bits above */ +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) + +/* + * Boolean to select which form of xfs_btree_block_t.bb_u to use. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BTREE_LONG_PTRS) +int xfs_btree_long_ptrs(xfs_btnum_t btnum); +#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP) /* NOTE(review): same expansion as the #else branch; the prototype above is not used by it */ +#else +#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP) +#endif + +/* + * Magic numbers for btree blocks.
+ */ +extern const __uint32_t xfs_magics[]; + +/* + * Maximum and minimum records in a btree block. + * Given block size, type prefix, and leaf flag (1 sizes by records, 0 by key + pointer pairs). + * The divisor below is equivalent to lf ? (e1) : (e2) but that produces + * compiler warnings. + */ +#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ + ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ + (((lf) * (uint)sizeof(t ## _rec_t)) + \ + ((1 - (lf)) * \ + ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) +#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ + (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) + +/* + * Record, key, and pointer address calculation macros. + * Given block size, type prefix, block pointer, index of requested entry + * (first entry numbered 1), and max records (mxr). Only PTR_ADDR uses mxr; bsz is unused by all three. + */ +#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \ + ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ + ((i) - 1) * sizeof(t ## _rec_t))) +#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \ + ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ + ((i) - 1) * sizeof(t ## _key_t))) +#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \ + ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ + (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) + +#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ + +/* + * Btree cursor structure. + * This collects all information needed by the btree code in one place.
+ */ +typedef struct xfs_btree_cur +{ + struct xfs_trans *bc_tp; /* transaction we're in, if any */ + struct xfs_mount *bc_mp; /* file system mount struct */ + union { + xfs_alloc_rec_t a; + xfs_bmbt_irec_t b; + xfs_inobt_rec_t i; + } bc_rec; /* current insert/search record value */ + struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level; not used for a bmap root held in the inode */ + int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # per level, numbered from 1 */ + __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits per level, XFS_BTCUR_*RA */ +#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead, or there is none */ +#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead, or there is none */ + __uint8_t bc_nlevels; /* number of levels in the tree */ + __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ + union { + struct { /* needed for BNO, CNT */ + struct xfs_buf *agbp; /* agf buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } a; + struct { /* needed for BMAP */ + struct xfs_inode *ip; /* pointer to our inode */ + struct xfs_bmap_free *flist; /* list to free after */ + xfs_fsblock_t firstblock; /* 1st blk allocated */ + int allocated; /* count of alloced */ + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags, XFS_BTCUR_BPRV_* */ +#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ + } b; + struct { /* needed for INO */ + struct xfs_buf *agbp; /* agi buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } i; + } bc_private; /* per-btree type data */ +} xfs_btree_cur_t; + +#define XFS_BTREE_NOERROR 0 +#define XFS_BTREE_ERROR 1 + +/* + * Convert from buffer to btree block header.
+ */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BLOCK) +xfs_btree_block_t *xfs_buf_to_block(struct xfs_buf *bp); +#define XFS_BUF_TO_BLOCK(bp) xfs_buf_to_block(bp) +#else /* macro form: cast the buffer data pointer to the generic block */ +#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)(XFS_BUF_PTR(bp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_LBLOCK) +xfs_btree_lblock_t *xfs_buf_to_lblock(struct xfs_buf *bp); +#define XFS_BUF_TO_LBLOCK(bp) xfs_buf_to_lblock(bp) +#else /* macro form: cast the buffer data pointer to the long form block */ +#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)(XFS_BUF_PTR(bp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBLOCK) +xfs_btree_sblock_t *xfs_buf_to_sblock(struct xfs_buf *bp); +#define XFS_BUF_TO_SBLOCK(bp) xfs_buf_to_sblock(bp) +#else /* macro form: cast the buffer data pointer to the short form block */ +#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)(XFS_BUF_PTR(bp))) +#endif + +#ifdef __KERNEL__ + +#ifdef DEBUG +/* + * Debug routine: check that block header is ok. + */ +void +xfs_btree_check_block( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_block_t *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Debug routine: check that keys are in the right order. + */ +void +xfs_btree_check_key( + xfs_btnum_t btnum, /* btree identifier */ + void *ak1, /* pointer to left (lower) key */ + void *ak2); /* pointer to right (higher) key */ + +/* + * Debug routine: check that records are in the right order. + */ +void +xfs_btree_check_rec( + xfs_btnum_t btnum, /* btree identifier */ + void *ar1, /* pointer to left (lower) record */ + void *ar2); /* pointer to right (higher) record */ +#else /* !DEBUG: the three check calls expand to nothing */ +#define xfs_btree_check_block(a,b,c,d) +#define xfs_btree_check_key(a,b,c) +#define xfs_btree_check_rec(a,b,c) +#endif /* DEBUG */ + +/* + * Checking routine: check that long form block header is ok.
+ */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_lblock_t *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Checking routine: check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_dfsbno_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Checking routine: check that short form block header is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_sblock_t *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block */ + +/* + * Checking routine: check that (short) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error); /* del because of error */ + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur);/* output cursor */ + +/* + * Change the cursor to point to the first record in the current block + * at the given level. Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to change */ + +/* + * Retrieve the block pointer from the cursor at the given level. + * This may be a bmap btree root or from a buffer. 
+ */ +xfs_btree_block_t * /* generic btree block pointer */ +xfs_btree_get_block( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree */ + struct xfs_buf **bpp); /* out: buffer containing the block, NULL for an in-inode bmap root */ + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +struct xfs_buf * /* buffer for fsbno */ +xfs_btree_get_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +struct xfs_buf * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Allocate a new btree cursor. + * The cursor is either for allocation (A) or bmap (B) or inodes (I). + */ +xfs_btree_cur_t * /* new btree cursor */ +xfs_btree_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* (A only) agf buffer, (I only) agi buffer */ + xfs_agnumber_t agno, /* (AI only) allocation group number */ + xfs_btnum_t btnum, /* btree identifier */ + struct xfs_inode *ip, /* (B only) inode owning the btree */ + int whichfork); /* (B only) data/attr fork */ + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to check */ + +/* + * Change the cursor to point to the last record in the current block + * at the given level. Other levels are unaffected.
+ */ +int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to change */ + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields, must be nonzero */ + const short *offsets,/* table of field offsets, nbits + 1 entries */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last); /* output: last byte offset */ + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* out: buffer for fsbno, may be NULL */ + int refval);/* ref count value for buffer */ + +/* + * Get a buffer for the block, return it read in. + * Short-form addressing. + */ +int /* error */ +xfs_btree_read_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* out: buffer for agno/agbno, may be NULL */ + int refval);/* ref count value for buffer */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +void /* nothing is returned */ +xfs_btree_reada_bufl( + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing.
+ */ +void /* nothing is returned */ +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + */ +int /* readahead block count */ +xfs_btree_readahead_core( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr); /* left/right bits */ + +static inline int /* readahead block count */ +xfs_btree_readahead( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) /* every requested bit is already recorded for this level */ + return 0; + + return xfs_btree_readahead_core(cur, lev, lr); +} + + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. + */ +void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + struct xfs_buf *bp); /* new buffer to set, may be NULL */ + +#endif /* __KERNEL__ */ + + +/* + * Min and max functions for extlen, agblock, fileoff, and filblks types. The macro forms evaluate each argument twice. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MIN) +xfs_extlen_t xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b); +#define XFS_EXTLEN_MIN(a,b) xfs_extlen_min(a,b) +#else +#define XFS_EXTLEN_MIN(a,b) \ + ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \ + (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MAX) +xfs_extlen_t xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b); +#define XFS_EXTLEN_MAX(a,b) xfs_extlen_max(a,b) +#else +#define XFS_EXTLEN_MAX(a,b) \ + ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ?
\ + (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MIN) +xfs_agblock_t xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b); +#define XFS_AGBLOCK_MIN(a,b) xfs_agblock_min(a,b) +#else +#define XFS_AGBLOCK_MIN(a,b) \ + ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \ + (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MAX) +xfs_agblock_t xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b); +#define XFS_AGBLOCK_MAX(a,b) xfs_agblock_max(a,b) +#else +#define XFS_AGBLOCK_MAX(a,b) \ + ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \ + (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MIN) +xfs_fileoff_t xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b); +#define XFS_FILEOFF_MIN(a,b) xfs_fileoff_min(a,b) +#else +#define XFS_FILEOFF_MIN(a,b) \ + ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \ + (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MAX) +xfs_fileoff_t xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b); +#define XFS_FILEOFF_MAX(a,b) xfs_fileoff_max(a,b) +#else +#define XFS_FILEOFF_MAX(a,b) \ + ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \ + (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MIN) +xfs_filblks_t xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b); +#define XFS_FILBLKS_MIN(a,b) xfs_filblks_min(a,b) +#else +#define XFS_FILBLKS_MIN(a,b) \ + ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \ + (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MAX) +xfs_filblks_t xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b); +#define XFS_FILBLKS_MAX(a,b) xfs_filblks_max(a,b) +#else +#define XFS_FILBLKS_MAX(a,b) \ + ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? 
\ + (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_SANITY_CHECK) +int xfs_fsb_sanity_check(struct xfs_mount *mp, xfs_fsblock_t fsb); +#define XFS_FSB_SANITY_CHECK(mp,fsb) xfs_fsb_sanity_check(mp,fsb) +#else +#define XFS_FSB_SANITY_CHECK(mp,fsb) \ + (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) +#endif + +/* + * Macros to set EFSCORRUPTED & return/branch. + */ +#define XFS_WANT_CORRUPTED_GOTO(x,l) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (!fs_is_ok) { \ + error = XFS_ERROR(EFSCORRUPTED); \ + goto l; \ + } \ + } + +#define XFS_WANT_CORRUPTED_RETURN(x) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (!fs_is_ok) \ + return XFS_ERROR(EFSCORRUPTED); \ + } + +#endif /* __XFS_BTREE_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_buf.h linux-2.4-xfs/fs/xfs/xfs_buf.h --- linux-2.4.19/fs/xfs/xfs_buf.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_buf.h Tue Aug 13 20:51:16 2002 @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +/* These are just for xfs_syncsub... it sets an internal variable + * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t + */ +#define XFS_B_ASYNC PBF_ASYNC +#define XFS_B_DELWRI PBF_DELWRI +#define XFS_B_READ PBF_READ +#define XFS_B_WRITE PBF_WRITE +#define XFS_B_STALE PBF_STALE +#define XFS_BUF_TRYLOCK PBF_TRYLOCK +#define XFS_INCORE_TRYLOCK PBF_TRYLOCK +#define XFS_BUF_LOCK PBF_LOCK +#define XFS_BUF_MAPPED PBF_MAPPED + +#define BUF_BUSY PBF_DONT_BLOCK + +#define XFS_BUF_BFLAGS(x) ((x)->pb_flags) /* debugging routines might need this */ +#define XFS_BUF_ZEROFLAGS(x) \ + ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI)) + +#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE) +#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE) +#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE) +#define XFS_BUF_SUPER_STALE(x) do { (x)->pb_flags |= XFS_B_STALE;\ + xfs_buf_undelay(x);\ + (x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE); } while (0) /* one statement: mark stale, undo delayed write, clear PARTIAL/NONE; stays whole under an unbraced if/else */ + +static inline void xfs_buf_undelay(page_buf_t *pb) +{ + if (pb->pb_flags & PBF_DELWRI) { /* only delayed-write buffers need any work */ + if (pb->pb_list.next != &pb->pb_list) { /* pb_list is linked to something: dequeue, then pagebuf_rele */ + pagebuf_delwri_dequeue(pb); + pagebuf_rele(pb); + } else { /* pb_list points at itself: just clear the flag */ + pb->pb_flags &= ~PBF_DELWRI; + } + } +} + +#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(x) xfs_buf_undelay(x) +#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI) + +#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no) +#define
XFS_BUF_GETERROR(x) pagebuf_geterror(x) +#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0) /* error state reduced to 0 or 1 */ + +#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE)) +#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE) +#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x))) + +#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO) +#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO) +#define XFS_BUF_ISBUSY(x) (1) /* constant: every buffer reports busy */ + +#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC) +#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) +#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) + +#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH) +#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH) +#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH) + +#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") +#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") +#define XFS_BUF_ISSHUT(x) (0) /* constant: the shut state is not implemented */ + +#define XFS_BUF_HOLD(x) pagebuf_hold(x) +#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ) +#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ) +#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ) + +#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE) +#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE) +#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE) + +#define XFS_BUF_ISUNINITIAL(x) ((x)->pb_flags & PBF_UNINITIAL) +#define XFS_BUF_UNUNINITIAL(x) ((x)->pb_flags &= ~PBF_UNINITIAL) + +#define XFS_BUF_BP_ISMAPPED(bp) 1 /* constant: every buffer reports mapped */ + +typedef struct page_buf_s xfs_buf_t; +#define xfs_buf page_buf_s /* so that struct xfs_buf names struct page_buf_s */ + +typedef struct pb_target xfs_buftarg_t; +#define xfs_buftarg pb_target /* so that struct xfs_buftarg names struct pb_target */ + +#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone +#define XFS_BUF_SET_IODONE_FUNC(buf, func) \ + (buf)->pb_iodone = (func) +#define XFS_BUF_CLR_IODONE_FUNC(buf) \ + (buf)->pb_iodone = NULL +#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \ + (buf)->pb_strat = (func) +#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \ + (buf)->pb_strat = NULL + +#define
XFS_BUF_FSPRIVATE(buf, type) \ + ((type)(buf)->pb_fspriv) +#define XFS_BUF_SET_FSPRIVATE(buf, value) \ + (buf)->pb_fspriv = (void *)(value) +#define XFS_BUF_FSPRIVATE2(buf, type) \ + ((type)(buf)->pb_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(buf, value) \ + (buf)->pb_fspriv2 = (void *)(value) +#define XFS_BUF_FSPRIVATE3(buf, type) \ + ((type)(buf)->pb_fspriv3) +#define XFS_BUF_SET_FSPRIVATE3(buf, value) \ + (buf)->pb_fspriv3 = (void *)(value) +#define XFS_BUF_SET_START(buf) + +#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \ + (buf)->pb_relse = (value) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr) + +extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, off_t offset) +{ + if (bp->pb_flags & PBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; + return (xfs_caddr_t) pagebuf_offset(bp, offset); +} + +#define XFS_BUF_SET_PTR(bp, val, count) \ + pagebuf_associate_memory(bp, val, count) +#define XFS_BUF_ADDR(bp) ((bp)->pb_bn) +#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset >> 9) +#define XFS_BUF_SET_ADDR(bp, blk) \ + ((bp)->pb_bn = (page_buf_daddr_t)(blk)) +#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) \ + ((bp)->pb_count_desired = cnt) +#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) \ + ((bp)->pb_buffer_length = cnt) +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp) + +#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp) +#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema); + +/* setup the buffer target from a buftarg structure */ +#define XFS_BUF_SET_TARGET(bp, target) \ + (bp)->pb_target = (target) + +#define XFS_BUF_TARGET_DEV(bp) ((bp)->pb_target->pbr_dev) +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define 
XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +#define xfs_buf_read(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE) +#define xfs_buf_get(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE) + +#define xfs_buf_read_flags(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_READ | PBF_MAPPABLE | flags) +#define xfs_buf_get_flags(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_MAPPABLE | flags) + +static inline int xfs_bawrite(void *mp, page_buf_t *bp) +{ + extern int xfs_bdstrat_cb(struct xfs_buf *); + int ret; + + bp->pb_fspriv3 = mp; + bp->pb_strat = xfs_bdstrat_cb; + xfs_buf_undelay(bp); + if ((ret = pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC)) == 0) + run_task_queue(&tq_disk); + return ret; +} + +static inline void xfs_buf_relse(page_buf_t *bp) +{ + if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse) + pagebuf_unlock(bp); + + pagebuf_rele(bp); +} + + +#define xfs_bpin(bp) pagebuf_pin(bp) +#define xfs_bunpin(bp) pagebuf_unpin(bp) + +#ifdef PAGEBUF_TRACE +# define PB_DEFINE_TRACES +# include +# define xfs_buftrace(id, bp) PB_TRACE(bp, PB_TRACE_REC(external), (void *)id) +#else +# define xfs_buftrace(id, bp) do { } while (0) +#endif + + +#define xfs_biodone(pb) \ + pagebuf_iodone(pb) + +#define xfs_incore(buftarg,blkno,len,lockit) \ + pagebuf_find(buftarg, blkno ,len, lockit) + + +#define xfs_biomove(pb, off, len, data, rw) \ + pagebuf_iomove((pb), (off), (len), (data), \ + ((rw) == XFS_B_WRITE) ? 
PBRW_WRITE : PBRW_READ) + +#define xfs_biozero(pb, off, len) \ + pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO) + + +static inline int XFS_bwrite(page_buf_t *pb) +{ + int sync = (pb->pb_flags & PBF_ASYNC) == 0; + int error; + + pb->pb_flags |= PBF_SYNC; + + xfs_buf_undelay(pb); + + __pagebuf_iorequest(pb); + + if (sync) { + error = pagebuf_iowait(pb); + xfs_buf_relse(pb); + } else { + run_task_queue(&tq_disk); + error = 0; + } + + return error; +} + + +#define XFS_bdwrite(pb) \ + pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) + +static inline int xfs_bdwrite(void *mp, page_buf_t *bp) +{ + extern int xfs_bdstrat_cb(struct xfs_buf *); + + bp->pb_strat = xfs_bdstrat_cb; + bp->pb_fspriv3 = mp; + + return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC); +} + +#define XFS_bdstrat(bp) pagebuf_iorequest(bp) + +#define xfs_iowait(pb) pagebuf_iowait(pb) + + +/* + * Go through all incore buffers, and release buffers + * if they belong to the given device. This is used in + * filesystem error handling to preserve the consistency + * of its metadata. + */ + +extern void XFS_bflush(xfs_buftarg_t *); +#define xfs_binval(buftarg) XFS_bflush(buftarg) + +#define xfs_incore_relse(buftarg,delwri_only,wait) \ + pagebuf_target_clear(buftarg) + + +#define xfs_baread(target, rablkno, ralen) \ + pagebuf_readahead((target), (rablkno), \ + (ralen), PBF_DONT_BLOCK) + +#define XFS_getrbuf(sleep,mp) \ + pagebuf_get_empty((mp)->m_ddev_targp) +#define XFS_ngetrbuf(len,mp) \ + pagebuf_get_no_daddr(len,(mp)->m_ddev_targp) +#define XFS_freerbuf(bp) pagebuf_free(bp) +#define XFS_nfreerbuf(bp) pagebuf_free(bp) + +#endif diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_buf_item.c linux-2.4-xfs/fs/xfs/xfs_buf_item.c --- linux-2.4.19/fs/xfs/xfs_buf_item.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_buf_item.c Thu Aug 1 01:28:15 2002 @@ -0,0 +1,1203 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * This file contains the implementation of the xfs_buf_log_item. + * It contains the item operations used to manipulate the buf log + * items as well as utility routines used by the buffer specific + * transaction routines. + */ + +#include <xfs.h> + + +#define ROUNDUPNBWORD(x) (((x) + (NBWORD - 1)) & ~(NBWORD - 1)) + +kmem_zone_t *xfs_buf_item_zone; + +#ifdef XFS_TRANS_DEBUG +/* + * This function uses an alternate strategy for tracking the bytes + * that the user requests to be logged. This can then be used + * in conjunction with the bli_orig array in the buf log item to + * catch bugs in our callers' code. 
+ * + * We also double check the bits set in xfs_buf_item_log using a + * simple algorithm to check that every byte is accounted for. + */ +STATIC void +xfs_buf_item_log_debug( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint x; + uint byte; + uint nbytes; + uint chunk_num; + uint word_num; + uint bit_num; + uint bit_set; + uint *wordp; + + ASSERT(bip->bli_logged != NULL); + byte = first; + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); + for (x = 0; x < nbytes; x++) { + chunk_num = byte >> XFS_BLI_SHIFT; + word_num = chunk_num >> BIT_TO_WORD_SHIFT; + bit_num = chunk_num & (NBWORD - 1); + wordp = &(bip->bli_format.blf_data_map[word_num]); + bit_set = *wordp & (1U << bit_num); + ASSERT(bit_set); + byte++; + } +} + +/* + * This function is called when we flush something into a buffer without + * logging it. This happens for things like inodes which are logged + * separately from the buffer. + */ +void +xfs_buf_item_flush_log_debug( + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip; + uint nbytes; + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { + return; + } + + ASSERT(bip->bli_logged != NULL); + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); +} + +/* + * This function is called to verify that our callers have logged + * all the bytes that they changed. + * + * It does this by comparing the original copy of the buffer stored in + * the buf log item's bli_orig array to the current copy of the buffer + * and ensuring that all bytes which miscompare are set in the bli_logged + * array of the buf log item. 
+ */ +STATIC void +xfs_buf_item_log_check( + xfs_buf_log_item_t *bip) +{ + char *orig; + char *buffer; + int x; + xfs_buf_t *bp; + + ASSERT(bip->bli_orig != NULL); + ASSERT(bip->bli_logged != NULL); + + bp = bip->bli_buf; + ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(XFS_BUF_PTR(bp) != NULL); + orig = bip->bli_orig; + buffer = XFS_BUF_PTR(bp); + for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) + cmn_err(CE_PANIC, + "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", + bip, bp, orig, x); + } +} +#else +#define xfs_buf_item_log_debug(x,y,z) +#define xfs_buf_item_log_check(x) +#endif + +STATIC void xfs_buf_error_relse(xfs_buf_t *bp); + +/* + * This returns the number of log iovecs needed to log the + * given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure + * and 1 for each stretch of non-contiguous chunks to be logged. + * Contiguous chunks are logged in a single iovec. + * + * If the XFS_BLI_STALE flag has been set, then log nothing. + */ +uint +xfs_buf_item_size( + xfs_buf_log_item_t *bip) +{ + uint nvecs; + int next_bit; + int last_bit; + xfs_buf_t *bp; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + xfs_buf_item_trace("SIZE STALE", bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + return 1; + } + + bp = bip->bli_buf; + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + nvecs = 1; + last_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(last_bit != -1); + nvecs++; + while (last_bit != -1) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. 
+ */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + last_bit + 1); + /* + * If we run out of bits, leave the loop, + * else if we find a new set of bits bump the number of vecs, + * else keep scanning the current set of bits. + */ + if (next_bit == -1) { + last_bit = -1; + } else if (next_bit != last_bit + 1) { + last_bit = next_bit; + nvecs++; + } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + + XFS_BLI_CHUNK)) { + last_bit = next_bit; + nvecs++; + } else { + last_bit++; + } + } + + xfs_buf_item_trace("SIZE NORM", bip); + return nvecs; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer; a stale item gets the format entry only. + */ +void +xfs_buf_item_format( + xfs_buf_log_item_t *bip, + xfs_log_iovec_t *log_vector) +{ + uint base_size; + uint nvecs; + xfs_log_iovec_t *vecp; + xfs_buf_t *bp; + int first_bit; + int last_bit; + int next_bit; + uint nbits; + uint buffer_offset; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + bp = bip->bli_buf; + ASSERT(XFS_BUF_BP_ISMAPPED(bp)); + vecp = log_vector; + + /* + * The size of the base structure is the size of the + * declared structure plus the space for the extra words + * of the bitmap. We subtract one from the map size, because + * the first element of the bitmap is accounted for in the + * size of the base structure. + */ + base_size = + (uint)(sizeof(xfs_buf_log_format_t) + + ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); + vecp->i_addr = (xfs_caddr_t)&bip->bli_format; + vecp->i_len = base_size; + vecp++; + nvecs = 1; + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. 
+ */ + xfs_buf_item_trace("FORMAT STALE", bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + bip->bli_format.blf_size = nvecs; + return; + } + + /* + * Fill in an iovec for each set of contiguous chunks. + */ + first_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(first_bit != -1); + last_bit = first_bit; + nbits = 1; + for (;;) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + (uint)last_bit + 1); + /* + * If we run out of bits fill in the last iovec and get + * out of the loop. + * Else if we start a new set of bits then fill in the + * iovec for the series we were looking at and start + * counting the bits in the new one. + * Else we're still in the same set of bits so just + * keep counting and scanning. 
+ */ + if (next_bit == -1) { + buffer_offset = first_bit * XFS_BLI_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLI_CHUNK; + nvecs++; + break; + } else if (next_bit != last_bit + 1) { + buffer_offset = first_bit * XFS_BLI_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLI_CHUNK; + nvecs++; + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + + XFS_BLI_CHUNK)) { + buffer_offset = first_bit * XFS_BLI_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLI_CHUNK; + /* + * nvecs is deliberately not bumped for this boundary split: + * recovery uses that count and gets confused by it. + */ + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else { + last_bit++; + nbits++; + } + } + bip->bli_format.blf_size = nvecs; + + /* + * Check to make sure everything is consistent. + */ + xfs_buf_item_trace("FORMAT NORM", bip); + xfs_buf_item_log_check(bip); +} + +/* + * This is called to pin the buffer associated with the buf log + * item in memory so it cannot be written out. Simply call bpin() + * on the buffer to do this. + */ +void +xfs_buf_item_pin( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + bp = bip->bli_buf; + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_trace("PIN", bip); + xfs_buftrace("XFS_PIN", bp); + xfs_bpin(bp); +} + + +/* + * This is called to unpin the buffer associated with the buf log + * item which was previously pinned with a call to xfs_buf_item_pin(). + * Just call bunpin() on the buffer to do this. + * + * Also drop the reference to the buf item for the current transaction. 
+ * If the XFS_BLI_STALE flag is set and we are the last reference, + * then free up the buf log item and unlock the buffer. + */ +void +xfs_buf_item_unpin( + xfs_buf_log_item_t *bip) +{ + xfs_mount_t *mp; + xfs_buf_t *bp; + int freed; + SPLDECL(s); + + bp = bip->bli_buf; + ASSERT(bp != NULL); + ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + xfs_buf_item_trace("UNPIN", bip); + xfs_buftrace("XFS_UNPIN", bp); + + freed = atomic_dec_and_test(&bip->bli_refcount); + mp = bip->bli_item.li_mountp; + xfs_bunpin(bp); + if (freed && (bip->bli_flags & XFS_BLI_STALE)) { + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(XFS_BUF_ISSTALE(bp)); +/** + ASSERT(bp->b_pincount == 0); +**/ + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + xfs_buf_item_trace("UNPIN STALE", bip); + xfs_buftrace("XFS_UNPIN STALE", bp); + AIL_LOCK(mp,s); + /* + * If we get called here because of an IO error, we may + * or may not have the item on the AIL. xfs_trans_delete_ail() + * will take care of that situation. + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); + xfs_buf_item_relse(bp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + xfs_buf_relse(bp); + } + +} + +/* + * this is called from uncommit in the forced-shutdown path. + * we need to check to see if the reference count on the log item + * is going to drop to zero. If so, unpin will free the log item + * so we need to free the item's descriptor (that points to the item) + * in the transaction. + */ +void +xfs_buf_item_unpin_remove( + xfs_buf_log_item_t *bip, + xfs_trans_t *tp) +{ + xfs_buf_t *bp; + xfs_log_item_desc_t *lidp; + + bp = bip->bli_buf; + /* + * will xfs_buf_item_unpin() call xfs_buf_item_relse()? 
+ */ + if ((atomic_read(&bip->bli_refcount) == 1) && + (bip->bli_flags & XFS_BLI_STALE)) { + ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); + xfs_buf_item_trace("UNPIN REMOVE", bip); + xfs_buftrace("XFS_UNPIN_REMOVE", bp); + /* + * yes -- clear the xaction descriptor in-use flag + * and free the chunk if required. We can safely + * do some work here and then call buf_item_unpin + * to do the rest because if the if is true, then + * we are holding the buffer locked so no one else + * will be able to bump up the refcount. + */ + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); + xfs_trans_free_item(tp, lidp); + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. + */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + } + + xfs_buf_item_unpin(bip); + + return; +} + +/* + * This is called to attempt to lock the buffer associated with this + * buf log item. Don't sleep on the buffer lock. If we can't get + * the lock right away, return 0. If we can get the lock, pull the + * buffer from the free list, mark it busy, and return 1. + */ +uint +xfs_buf_item_trylock( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + bp = bip->bli_buf; + + if (XFS_BUF_ISPINNED(bp)) { + return XFS_ITEM_PINNED; + } + + if (!XFS_BUF_CPSEMA(bp)) { + return XFS_ITEM_LOCKED; + } + + /* + * Remove the buffer from the free list. Only do this + * if it's on the free list. Private buffers like the + * superblock buffer are not. + */ + XFS_BUF_HOLD(bp); + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_trace("TRYLOCK SUCCESS", bip); + return XFS_ITEM_SUCCESS; +} + +/* + * Release the buffer associated with the buf log item. + * If there is no dirty logged data associated with the + * buffer recorded in the buf log item, then free the + * buf log item and remove the reference to it in the + * buffer. + * + * This call ignores the recursion count. 
It is only called + * when the buffer should REALLY be unlocked, regardless + * of the recursion count. + * + * If the XFS_BLI_HOLD flag is set in the buf log item, then + * free the log item if necessary but do not unlock the buffer. + * This is for support of xfs_trans_bhold(). Make sure the + * XFS_BLI_HOLD field is cleared if we don't free the item. + */ +void +xfs_buf_item_unlock( + xfs_buf_log_item_t *bip) +{ + int aborted; + xfs_buf_t *bp; + uint hold; + + bp = bip->bli_buf; + xfs_buftrace("XFS_UNLOCK", bp); + + /* + * Clear the buffer's association with this transaction. + */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + + /* + * If this is a transaction abort, don't return early. + * Instead, allow the brelse to happen. + * Normally it would be done for stale (cancelled) buffers + * at unpin time, but we'll never go through the pin/unpin + * cycle if we abort inside commit. + */ + aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; + + /* + * If the buf item is marked stale, then don't do anything. + * We'll unlock the buffer and free the buf item when the + * buffer is unpinned for the last time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + bip->bli_flags &= ~XFS_BLI_LOGGED; + xfs_buf_item_trace("UNLOCK STALE", bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + if (!aborted) + return; + } + + /* + * Drop the transaction's reference to the log item if + * it was not logged as part of the transaction. Otherwise + * we'll drop the reference in xfs_buf_item_unpin() when + * the transaction is really through with the buffer. + */ + if (!(bip->bli_flags & XFS_BLI_LOGGED)) { + atomic_dec(&bip->bli_refcount); + } else { + /* + * Clear the logged flag since this is per + * transaction state. + */ + bip->bli_flags &= ~XFS_BLI_LOGGED; + } + + /* + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. 
+ */ + hold = bip->bli_flags & XFS_BLI_HOLD; + xfs_buf_item_trace("UNLOCK", bip); + + /* + * If the buf item isn't tracking any data, free it. + * Otherwise, if XFS_BLI_HOLD is set clear it. + */ + if (xfs_count_bits(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0) == 0) { + xfs_buf_item_relse(bp); + } else if (hold) { + bip->bli_flags &= ~XFS_BLI_HOLD; + } + + /* + * Release the buffer if XFS_BLI_HOLD was not set. + */ + if (!hold) { + xfs_buf_relse(bp); + } +} + +/* + * This is called to find out where the oldest active copy of the + * buf log item in the on disk log resides now that the last log + * write of it completed at the given lsn. + * We always re-log all the dirty data in a buffer, so usually the + * latest copy in the on disk log is the only one that matters. For + * those cases we simply return the given lsn. + * + * The one exception to this is for buffers full of newly allocated + * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF + * flag set, indicating that only the di_next_unlinked fields from the + * inodes in the buffers will be replayed during recovery. If the + * original newly allocated inode images have not yet been flushed + * when the buffer is so relogged, then we need to make sure that we + * keep the old images in the 'active' portion of the log. We do this + * by returning the original lsn of that transaction here rather than + * the current one. + */ +xfs_lsn_t +xfs_buf_item_committed( + xfs_buf_log_item_t *bip, + xfs_lsn_t lsn) +{ + xfs_buf_item_trace("COMMITTED", bip); + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + (bip->bli_item.li_lsn != 0)) { + return bip->bli_item.li_lsn; + } + return (lsn); +} + +/* + * This is called when the transaction holding the buffer is aborted. + * Just behave as if the transaction had been cancelled. If we're shutting down + * and have aborted this transaction, we'll trap this buffer when it tries to + * get written out. 
+ */ +void +xfs_buf_item_abort( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + bp = bip->bli_buf; + xfs_buftrace("XFS_ABORT", bp); + XFS_BUF_SUPER_STALE(bp); + xfs_buf_item_unlock(bip); + return; +} + +/* + * This is called to asynchronously write the buffer associated with this + * buf log item out to disk. The buffer will already have been locked by + * a successful call to xfs_buf_item_trylock(). If the buffer still has + * B_DELWRI set, then get it going out to disk with a call to bawrite(). + * If not, then just release the buffer. + */ +void +xfs_buf_item_push( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_trace("PUSH", bip); + + bp = bip->bli_buf; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + xfs_bawrite(bip->bli_item.li_mountp, bp); + } else { + xfs_buf_relse(bp); + } +} + +/* ARGSUSED */ +void +xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +struct xfs_item_ops xfs_buf_item_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_buf_item_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) + xfs_buf_item_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_buf_item_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort, + .iop_pushbuf = NULL, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_buf_item_committing +}; + + +/* + * Allocate a new buf log item to go with the given buffer. 
+ * Set the buffer's b_fsprivate field to point to the new + * buf log item. If there are other item's attached to the + * buffer (see xfs_buf_attach_iodone() below), then put the + * buf log item at the front. + */ +void +xfs_buf_item_init( + xfs_buf_t *bp, + xfs_mount_t *mp) +{ + xfs_log_item_t *lip; + xfs_buf_log_item_t *bip; + int chunks; + int map_size; + + /* + * Check to see if there is already a buf log item for + * this buffer. If there is, it is guaranteed to be + * the first. If we do already have one, there is + * nothing to do here so return. + */ + if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) + XFS_BUF_SET_FSPRIVATE3(bp, mp); + XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (lip->li_type == XFS_LI_BUF) { + return; + } + } + + /* + * chunks is the number of XFS_BLI_CHUNK size pieces + * the buffer can be divided into. Make sure not to + * truncate any pieces. map_size is the size of the + * bitmap needed to describe the chunks of the buffer. + */ + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); + map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); + + bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, + KM_SLEEP); + bip->bli_item.li_type = XFS_LI_BUF; + bip->bli_item.li_ops = &xfs_buf_item_ops; + bip->bli_item.li_mountp = mp; + bip->bli_buf = bp; + bip->bli_format.blf_type = XFS_LI_BUF; + bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); + bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_map_size = map_size; +#ifdef XFS_BLI_TRACE + bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); +#endif + +#ifdef XFS_TRANS_DEBUG + /* + * Allocate the arrays for tracking what needs to be logged + * and what our callers request to be logged. 
bli_orig + * holds a copy of the original, clean buffer for comparison + * against, and bli_logged keeps a 1 bit flag per byte in + * the buffer to indicate which bytes the callers have asked + * to have logged. + */ + bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); + bcopy(XFS_BUF_PTR(bp), bip->bli_orig, XFS_BUF_COUNT(bp)); + bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); +#endif + + /* + * Put the buf item into the list of items attached to the + * buffer at the front. + */ + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + bip->bli_item.li_bio_list = + XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + } + XFS_BUF_SET_FSPRIVATE(bp, bip); +} + + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint first_bit; + uint last_bit; + uint bits_to_set; + uint bits_set; + uint word_num; + uint *wordp; + uint bit; + uint end_bit; + uint mask; + + /* + * Mark the item as having some dirty data for + * quick reference in xfs_buf_item_dirty. + */ + bip->bli_flags |= XFS_BLI_DIRTY; + + /* + * Convert byte offsets to bit numbers. + */ + first_bit = first >> XFS_BLI_SHIFT; + last_bit = last >> XFS_BLI_SHIFT; + + /* + * Calculate the total number of bits to be set. + */ + bits_to_set = last_bit - first_bit + 1; + + /* + * Get a pointer to the first word in the bitmap + * to set a bit in. + */ + word_num = first_bit >> BIT_TO_WORD_SHIFT; + wordp = &(bip->bli_format.blf_data_map[word_num]); + + /* + * Calculate the starting bit in the first word. + */ + bit = first_bit & (uint)(NBWORD - 1); + + /* + * First set any bits in the first word of our range. + * If it starts at bit 0 of the word, it will be + * set below rather than here. That is what the variable + * bit tells us. The variable bits_set tracks the number + * of bits that have been set so far. 
End_bit is the number + * of the last bit to be set in this word plus one. + */ + if (bit) { + end_bit = MIN(bit + bits_to_set, (uint)NBWORD); + mask = ((1U << (end_bit - bit)) - 1) << bit; + *wordp |= mask; + wordp++; + bits_set = end_bit - bit; + } else { + bits_set = 0; + } + + /* + * Now set bits a whole word at a time that are between + * first_bit and last_bit. + */ + while ((bits_to_set - bits_set) >= NBWORD) { + *wordp |= 0xffffffff; + bits_set += NBWORD; + wordp++; + } + + /* + * Finally, set any bits left to be set in one last partial word. + */ + end_bit = bits_to_set - bits_set; + if (end_bit) { + mask = (1U << end_bit) - 1; + *wordp |= mask; + } + + xfs_buf_item_log_debug(bip, first, last); +} + + +/* + * Return nonzero if the buffer has some data that has been logged (at any + * point, not just the current transaction) and 0 if not. + */ +uint +xfs_buf_item_dirty( + xfs_buf_log_item_t *bip) +{ + return (bip->bli_flags & XFS_BLI_DIRTY); +} + +/* + * This is called when the buf log item is no longer needed. It should + * free the buf log item associated with the given buffer and clear + * the buffer's pointer to the buf log item. If there are no more + * items in the list, clear the b_iodone field of the buffer (see + * xfs_buf_attach_iodone() below). 
+ */ +void +xfs_buf_item_relse( + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + xfs_buftrace("XFS_RELSE", bp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); + if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && + (XFS_BUF_IODONE_FUNC(bp) != NULL)) { +/** + ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0); +***/ + XFS_BUF_CLR_IODONE_FUNC(bp); + } + +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); + bip->bli_orig = NULL; + kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); + bip->bli_logged = NULL; +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + + +/* + * Add the given log item with its callback to the list of callbacks + * to be called when the buffer's I/O completes. If it is not set + * already, set the buffer's b_iodone() routine to be + * xfs_buf_iodone_callbacks() and link the log item into the list of + * items rooted at b_fsprivate. Items are always added as the second + * entry in the list if there is a first, because the buf item code + * assumes that the buf log item is first. 
+ */
+void
+xfs_buf_attach_iodone(
+	xfs_buf_t	*bp,
+	void		(*cb)(xfs_buf_t *, xfs_log_item_t *),
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*head_lip;	/* current first item on bp's list */
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	lip->li_cb = cb;
+	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+		head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		lip->li_bio_list = head_lip->li_bio_list;	/* splice in as 2nd entry */
+		head_lip->li_bio_list = lip;
+	} else {
+		XFS_BUF_SET_FSPRIVATE(bp, lip);	/* empty list: lip becomes the head */
+	}
+
+	ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
+	       (XFS_BUF_IODONE_FUNC(bp) == NULL));
+	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+}
+
+STATIC void	/* call li_cb of every item on the li_bio_list chain starting at lip */
+xfs_buf_do_callbacks(
+	xfs_buf_t	*bp,
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*nlip;	/* next item, saved before the callback runs */
+
+	while (lip != NULL) {
+		nlip = lip->li_bio_list;
+		ASSERT(lip->li_cb != NULL);
+		/*
+		 * Clear the next pointer so we don't have any
+		 * confusion if the item is added to another buf.
+		 * Don't touch the log item after calling its
+		 * callback, because it could have freed itself.
+		 */
+		lip->li_bio_list = NULL;
+		lip->li_cb(bp, lip);
+		lip = nlip;
+	}
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks
+ * attached to them by xfs_buf_attach_iodone(). It should remove each
+ * log item from the buffer's list and call the callback of each in turn.
+ * When done, the buffer's fsprivate field is set to NULL and the buffer
+ * is unlocked with a call to iodone().
+ */
+void
+xfs_buf_iodone_callbacks(
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_t	*lip;
+	static time_t	lasttime;	/* lbolt when the last error was reported */
+	static dev_t	lastdev;	/* device of the last failed write seen */
+	xfs_mount_t	*mp;
+
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+
+	if (XFS_BUF_GETERROR(bp) != 0) {
+		/*
+		 * If we've already decided to shut down the filesystem
+		 * because of IO errors, there's no point in giving this
+		 * a retry.
+		 */
+		mp = lip->li_mountp;
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			ASSERT(XFS_BUF_TARGET_DEV(bp) == mp->m_dev);
+			XFS_BUF_SUPER_STALE(bp);
+			xfs_buftrace("BUF_IODONE_CB", bp);
+			xfs_buf_do_callbacks(bp, lip);
+			XFS_BUF_SET_FSPRIVATE(bp, NULL);
+			XFS_BUF_CLR_IODONE_FUNC(bp);
+
+			/*
+			 * XFS_SHUT flag gets set when we go thru the
+			 * entire buffer cache and deliberately start
+			 * throwing away delayed write buffers.
+			 * Since there's no biowait done on those,
+			 * we should just brelse them.
+			 */
+			if (XFS_BUF_ISSHUT(bp)) {
+				XFS_BUF_UNSHUT(bp);
+				xfs_buf_relse(bp);
+			} else {
+				xfs_biodone(bp);
+			}
+
+			return;
+		}
+
+		if ((XFS_BUF_TARGET_DEV(bp) != lastdev) ||	/* rate-limit the report */
+		    ((lbolt - lasttime) > 500)) {
+			prdev("XFS write error in file system meta-data "
+			      "block 0x%Lx in %s",
+			      XFS_BUF_TARGET_DEV(bp),
+			      XFS_BUF_ADDR(bp), mp->m_fsname);
+			lasttime = lbolt;
+		}
+		lastdev = XFS_BUF_TARGET_DEV(bp);
+
+		if (XFS_BUF_ISASYNC(bp)) {
+			/*
+			 * If the write was asynchronous then no one will be
+			 * looking for the error. Clear the error state
+			 * and write the buffer out again delayed write.
+			 *
+			 * XXXsup This is OK, so long as we catch these
+			 * before we start the umount; we don't want these
+			 * DELWRI metadata bufs to be hanging around.
+			 */
+			XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
+
+			if (!(XFS_BUF_ISSTALE(bp))) {
+				XFS_BUF_DELAYWRITE(bp);
+				XFS_BUF_DONE(bp);
+				XFS_BUF_SET_START(bp);
+			}
+			ASSERT(XFS_BUF_IODONE_FUNC(bp));
+			xfs_buftrace("BUF_IODONE ASYNC", bp);
+			xfs_buf_relse(bp);
+		} else {
+			/*
+			 * If the write of the buffer was not asynchronous,
+			 * then we want to make sure to return the error
+			 * to the caller of bwrite(). Because of this we
+			 * cannot clear the B_ERROR state at this point.
+			 * Instead we install a callback function that
+			 * will be called when the buffer is released, and
+			 * that routine will clear the error state and
+			 * set the buffer to be written out again after
+			 * some delay.
+			 */
+			/* We actually overwrite the existing b-relse
+			   function at times, but we're gonna be shutting down
+			   anyway. */
+			XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
+			XFS_BUF_DONE(bp);
+			XFS_BUF_V_IODONESEMA(bp);
+		}
+		return;	/* error path: the item callbacks are not run here */
+	}
+#ifdef XFSERRORDEBUG
+	xfs_buftrace("XFS BUFCB NOERR", bp);
+#endif
+	xfs_buf_do_callbacks(bp, lip);
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	xfs_biodone(bp);
+}
+
+/*
+ * This is a callback routine attached to a buffer which gets an error
+ * when being written out synchronously.
+ */
+STATIC void
+xfs_buf_error_relse(
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_t	*lip;
+	xfs_mount_t	*mp;
+
+	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	mp = (xfs_mount_t *)lip->li_mountp;
+	ASSERT(XFS_BUF_TARGET_DEV(bp) == mp->m_dev);
+
+	XFS_BUF_STALE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_ERROR(bp,0);
+	xfs_buftrace("BUF_ERROR_RELSE", bp);
+	if (! XFS_FORCED_SHUTDOWN(mp))
+		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+	/*
+	 * We have to unpin the pinned buffers so do the
+	 * callbacks.
+	 */
+	xfs_buf_do_callbacks(bp, lip);
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_SET_BRELSE_FUNC(bp,NULL);	/* remove this handler before releasing */
+	xfs_buf_relse(bp);
+}
+
+
+/*
+ * This is the iodone() function for buffers which have been
+ * logged. It is called when they are eventually flushed out.
+ * It should remove the buf item from the AIL, and free the buf item.
+ * It is called by xfs_buf_iodone_callbacks() above which will take
+ * care of cleaning up the buffer itself.
+ */
+/* ARGSUSED */
+void
+xfs_buf_iodone(
+	xfs_buf_t		*bp,
+	xfs_buf_log_item_t	*bip)
+{
+	struct xfs_mount	*mp;
+	SPLDECL(s);
+
+	ASSERT(bip->bli_buf == bp);
+
+	mp = bip->bli_item.li_mountp;
+
+	/*
+	 * If we are forcibly shutting down, this may well be
+	 * off the AIL already. That's because we simulate the
+	 * log-committed callbacks to unpin these buffers. Or we may never
+	 * have put this item on AIL because the transaction was
+	 * aborted forcibly. xfs_trans_delete_ail() takes care of these.
+	 *
+	 * Either way, AIL is useless if we're forcing a shutdown.
+	 */
+	AIL_LOCK(mp,s);
+	/*
+	 * xfs_trans_delete_ail() drops the AIL lock.
+	 */
+	xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
+
+#ifdef XFS_TRANS_DEBUG
+	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
+	bip->bli_orig = NULL;
+	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
+	bip->bli_logged = NULL;
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+	ktrace_free(bip->bli_trace);
+#endif
+	kmem_zone_free(xfs_buf_item_zone, bip);	/* bip is gone after this */
+}
+
+#if defined(XFS_BLI_TRACE)
+void	/* record one trace entry describing bip and its buffer under tag id */
+xfs_buf_item_trace(
+	char			*id,
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t		*bp;
+	ASSERT(bip->bli_trace != NULL);
+
+	bp = bip->bli_buf;
+	ktrace_enter(bip->bli_trace,
+		     (void *)id,
+		     (void *)bip->bli_buf,
+		     (void *)((unsigned long)bip->bli_flags),
+		     (void *)((unsigned long)bip->bli_recur),
+		     (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
+		     (void *)XFS_BUF_ADDR(bp),
+		     (void *)((unsigned long)XFS_BUF_COUNT(bp)),
+		     (void *)((unsigned long)(0xFFFFFFFF & (XFS_BFLAGS(bp) >> 32))),	/* flags, high 32 bits */
+		     (void *)((unsigned long)(0xFFFFFFFF & XFS_BFLAGS(bp))),	/* flags, low 32 bits */
+		     XFS_BUF_FSPRIVATE(bp, void *),
+		     XFS_BUF_FSPRIVATE2(bp, void *),
+		     (void *)((unsigned long)bp->b_pincount),
+		     (void *)XFS_BUF_IODONE_FUNC(bp),
+		     (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
+		     (void *)bip->bli_item.li_desc,
+		     (void *)((unsigned long)bip->bli_item.li_flags));
+}
+#endif /* XFS_BLI_TRACE */
+
+
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_buf_item.h linux-2.4-xfs/fs/xfs/xfs_buf_item.h
--- linux-2.4.19/fs/xfs/xfs_buf_item.h	Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_buf_item.h	Thu Aug 1 01:28:15 2002
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BUF_ITEM_H__ +#define __XFS_BUF_ITEM_H__ + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. This structure works only on buffers that + * reside up to the first TB in the filesystem. These buffers are + * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF. 
+ */
+typedef struct xfs_buf_log_format_v1 {
+	unsigned short	blf_type;	/* buf log item type indicator */
+	unsigned short	blf_size;	/* size of this item */
+	__int32_t	blf_blkno;	/* starting blkno of this buf (32 bits) */
+	ushort		blf_flags;	/* misc state */
+	ushort		blf_len;	/* number of blocks in this buf */
+	unsigned int	blf_map_size;	/* size of data bitmap in words */
+	unsigned int	blf_data_map[1];/* variable size bitmap of */
+					/* regions of buffer in this item */
+} xfs_buf_log_format_v1_t;
+
+/*
+ * This is a form of the above structure with a 64 bit blkno field.
+ * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
+ */
+typedef struct xfs_buf_log_format_t {
+	unsigned short	blf_type;	/* buf log item type indicator */
+	unsigned short	blf_size;	/* size of this item */
+	ushort		blf_flags;	/* misc state */
+	ushort		blf_len;	/* number of blocks in this buf */
+	__int64_t	blf_blkno;	/* starting blkno of this buf (64 bits) */
+	unsigned int	blf_map_size;	/* size of data bitmap in words */
+	unsigned int	blf_data_map[1];/* variable size bitmap of */
+					/* regions of buffer in this item */
+} xfs_buf_log_format_t;
+
+/*
+ * This flag indicates that the buffer contains on-disk inodes
+ * and requires special recovery handling.
+ */
+#define	XFS_BLI_INODE_BUF	0x1
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define	XFS_BLI_CANCEL		0x2
+/*
+ * This flag indicates that the buffer contains on-disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define	XFS_BLI_UDQUOT_BUF	0x4
+/* #define XFS_BLI_PDQUOT_BUF	0x8 */
+#define	XFS_BLI_GDQUOT_BUF	0x10
+
+#define	XFS_BLI_CHUNK		128	/* bytes covered by one bitmap bit */
+#define	XFS_BLI_SHIFT		7	/* 1 << 7 == XFS_BLI_CHUNK */
+#define	BIT_TO_WORD_SHIFT	5	/* 1 << 5 == 32; assumes NBWORD is 32 -- confirm */
+#define	NBWORD			(NBBY * sizeof(unsigned int))	/* bits per bitmap word */
+
+/*
+ * buf log item flags
+ */
+#define	XFS_BLI_HOLD		0x01
+#define	XFS_BLI_DIRTY		0x02
+#define	XFS_BLI_STALE		0x04
+#define	XFS_BLI_LOGGED		0x08
+#define	XFS_BLI_INODE_ALLOC_BUF	0x10
+
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct ktrace;
+struct xfs_mount;
+
+/*
+ * This is the in core log item structure used to track information
+ * needed to log buffers. It tracks how many times the lock has been
+ * locked, and which 128 byte chunks of the buffer are dirty.
+ */
+typedef struct xfs_buf_log_item {
+	xfs_log_item_t		bli_item;	/* common item structure */
+	struct xfs_buf		*bli_buf;	/* real buffer pointer */
+	unsigned int		bli_flags;	/* misc flags */
+	unsigned int		bli_recur;	/* lock recursion count */
+	atomic_t		bli_refcount;	/* cnt of tp refs */
+#ifdef DEBUG
+	struct ktrace		*bli_trace;	/* event trace buf */
+#endif
+#ifdef XFS_TRANS_DEBUG
+	char			*bli_orig;	/* original buffer copy */
+	char			*bli_logged;	/* bytes logged (bitmap) */
+#endif
+	xfs_buf_log_format_t	bli_format;	/* in-log header */
+} xfs_buf_log_item_t;
+
+/*
+ * This structure is used during recovery to record the buf log
+ * items which have been canceled and should not be replayed.
+ */
+typedef struct xfs_buf_cancel {
+	xfs_daddr_t		bc_blkno;	/* presumably start block of the buf -- confirm */
+	uint			bc_len;		/* presumably its length -- confirm */
+	int			bc_refcount;	/* reference count (users not shown here) */
+	struct xfs_buf_cancel	*bc_next;	/* next record in a singly linked chain */
+} xfs_buf_cancel_t;
+
+#define	XFS_BLI_TRACE_SIZE	32
+
+
+#if defined(XFS_ALL_TRACE)
+#define	XFS_BLI_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_BLI_TRACE	/* bli_trace only exists in DEBUG builds */
+#endif
+
+#if defined(XFS_BLI_TRACE)
+void	xfs_buf_item_trace(char *, xfs_buf_log_item_t *);
+#else
+#define	xfs_buf_item_trace(id, bip)	/* expands to nothing */
+#endif
+
+void	xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void	xfs_buf_item_relse(struct xfs_buf *);
+void	xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+uint	xfs_buf_item_dirty(xfs_buf_log_item_t *);
+void	xfs_buf_attach_iodone(struct xfs_buf *,
+			      void(*)(struct xfs_buf *, xfs_log_item_t *),
+			      xfs_log_item_t *);
+void	xfs_buf_iodone_callbacks(struct xfs_buf *);
+void	xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
+
+#ifdef XFS_TRANS_DEBUG
+void
+xfs_buf_item_flush_log_debug(
+	struct xfs_buf *bp,
+	uint	first,
+	uint	last);
+#else
+#define	xfs_buf_item_flush_log_debug(bp, first, last)	/* expands to nothing */
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_BUF_ITEM_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_cap.c linux-2.4-xfs/fs/xfs/xfs_cap.c
--- linux-2.4.19/fs/xfs/xfs_cap.c	Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_cap.c	Sat Jul 13 01:46:50 2002
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.
Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +STATIC int xfs_cap_allow_set(vnode_t *); + + +/* + * Test for existence of capability attribute as efficiently as possible. + */ +int +xfs_cap_vhascap( + vnode_t *vp) +{ + int error; + int len = sizeof(xfs_cap_set_t); + int flags = ATTR_KERNOVAL|ATTR_ROOT; + + VOP_ATTR_GET(vp, SGI_CAP_LINUX, NULL, &len, flags, sys_cred, error); + return (error == 0); +} + +/* + * Convert from extended attribute representation to in-memory for XFS. + */ +STATIC int +posix_cap_xattr_to_xfs( + posix_cap_xattr *src, + size_t size, + xfs_cap_set_t *dest) +{ + if (!src || !dest) + return EINVAL; + + if (src->c_version != cpu_to_le32(POSIX_CAP_XATTR_VERSION)) + return EINVAL; + if (src->c_abiversion != cpu_to_le32(_LINUX_CAPABILITY_VERSION)) + return EINVAL; + + if (size < sizeof(posix_cap_xattr)) + return EINVAL; + + ASSERT(sizeof(dest->cap_effective) == sizeof(src->c_effective)); + + dest->cap_effective = src->c_effective; + dest->cap_permitted = src->c_permitted; + dest->cap_inheritable = src->c_inheritable; + + return 0; +} + +/* + * Convert from in-memory XFS to extended attribute representation. 
+ */ +STATIC int +posix_cap_xfs_to_xattr( + xfs_cap_set_t *src, + posix_cap_xattr *xattr_cap, + size_t size) +{ + size_t new_size = posix_cap_xattr_size(); + + if (size < new_size) + return -ERANGE; + + ASSERT(sizeof(xattr_cap->c_effective) == sizeof(src->cap_effective)); + + xattr_cap->c_version = cpu_to_le32(POSIX_CAP_XATTR_VERSION); + xattr_cap->c_abiversion = cpu_to_le32(_LINUX_CAPABILITY_VERSION); + xattr_cap->c_effective = src->cap_effective; + xattr_cap->c_permitted = src->cap_permitted; + xattr_cap->c_inheritable= src->cap_inheritable; + + return new_size; +} + +int +xfs_cap_vget( + vnode_t *vp, + void *cap, + size_t size) +{ + int error; + int len = sizeof(xfs_cap_set_t); + int flags = ATTR_ROOT; + xfs_cap_set_t xfs_cap = { 0 }; + posix_cap_xattr *xattr_cap = cap; + + VN_HOLD(vp); + if ((error = _MAC_VACCESS(vp, NULL, VREAD))) + goto out; + + if (!size) + flags |= ATTR_KERNOVAL; + VOP_ATTR_GET(vp, SGI_CAP_LINUX, (char *)&xfs_cap, + &len, flags, sys_cred, error); + if (error) + goto out; + ASSERT(len == sizeof(xfs_cap_set_t)); + + error = (size)? 
-posix_cap_xattr_size() : + -posix_cap_xfs_to_xattr(&xfs_cap, xattr_cap, size); +out: + VN_RELE(vp); + return -error; +} + +int +xfs_cap_vremove( + vnode_t *vp) +{ + int error; + + VN_HOLD(vp); + error = xfs_cap_allow_set(vp); + if (!error) { + VOP_ATTR_REMOVE(vp, SGI_CAP_LINUX, ATTR_ROOT, sys_cred, error); + if (error == ENOATTR) + error = 0; /* 'scool */ + } + VN_RELE(vp); + return -error; +} + +int +xfs_cap_vset( + vnode_t *vp, + void *cap, + size_t size) +{ + posix_cap_xattr *xattr_cap = cap; + xfs_cap_set_t xfs_cap; + int error; + + if (!cap) + return -EINVAL; + + error = posix_cap_xattr_to_xfs(xattr_cap, size, &xfs_cap); + if (error) + return -error; + + VN_HOLD(vp); + error = xfs_cap_allow_set(vp); + if (error) + goto out; + + VOP_ATTR_SET(vp, SGI_CAP_LINUX, (char *)&xfs_cap, + sizeof(xfs_cap_set_t), ATTR_ROOT, sys_cred, error); +out: + VN_RELE(vp); + return -error; +} + +STATIC int +xfs_cap_allow_set( + vnode_t *vp) +{ + vattr_t va; + int error; + + if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + return EROFS; + if ((error = _MAC_VACCESS(vp, NULL, VWRITE))) + return error; + va.va_mask = AT_UID; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return error; + if (va.va_uid != current->fsuid && !capable(CAP_FOWNER)) + return EPERM; + return error; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_cap.h linux-2.4-xfs/fs/xfs/xfs_cap.h --- linux-2.4.19/fs/xfs/xfs_cap.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_cap.h Thu Aug 1 12:24:25 2002 @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CAP_H__
+#define __XFS_CAP_H__
+
+/*
+ * Capabilities
+ */
+typedef __uint64_t xfs_cap_value_t;	/* one 64-bit capability bitmask */
+
+typedef struct xfs_cap_set {
+	xfs_cap_value_t	cap_effective;	/* use in capability checks */
+	xfs_cap_value_t	cap_permitted;	/* combined with file attrs */
+	xfs_cap_value_t	cap_inheritable;/* pass through exec */
+} xfs_cap_set_t;
+
+/* On-disk XFS extended attribute names */
+#define SGI_CAP_FILE	"SGI_CAP_FILE"
+#define SGI_CAP_FILE_SIZE	(sizeof(SGI_CAP_FILE)-1)	/* length without the NUL */
+#define SGI_CAP_LINUX	"SGI_CAP_LINUX"
+#define SGI_CAP_LINUX_SIZE	(sizeof(SGI_CAP_LINUX)-1)	/* length without the NUL */
+
+/*
+ * For Linux, we take the bitfields directly from capability.h
+ * and no longer attempt to keep this attribute on-disk compatible
+ * with IRIX. Since this attribute is only set on executables,
+ * it just doesn't make much sense to try. We do use a different
+ * named attribute though, to avoid confusion.
+ */
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_CAP
+
+#include /* NOTE(review): header name lost from this line -- restore from the upstream tree */
+
+struct vnode;
+
+extern int xfs_cap_vhascap(struct vnode *);
+extern int xfs_cap_vset(struct vnode *, void *, size_t);
+extern int xfs_cap_vget(struct vnode *, void *, size_t);
+extern int xfs_cap_vremove(struct vnode *vp);
+
+#define _CAP_EXISTS		xfs_cap_vhascap
+
+#else	/* capability support compiled out: every operation reports "unsupported" */
+#define xfs_cap_vset(v,p,sz)	(-EOPNOTSUPP)
+#define xfs_cap_vget(v,p,sz)	(-EOPNOTSUPP)
+#define xfs_cap_vremove(v)	(-EOPNOTSUPP)
+#define _CAP_EXISTS		(NULL)	/* no existence test available */
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_CAP_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_clnt.h linux-2.4-xfs/fs/xfs/xfs_clnt.h
--- linux-2.4.19/fs/xfs/xfs_clnt.h	Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_clnt.h	Wed Jul 17 17:28:54 2002
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CLNT_H__
+#define __XFS_CLNT_H__
+
+/*
+ * XFS arguments structure, constructed from the arguments we
+ * are passed via the mount system call.
+ *
+ * NOTE: The mount system call is handled differently between
+ * Linux and IRIX. In IRIX we worked with a binary data
+ * structure coming in across the syscall interface from user
+ * space (the mount userspace knows about each filesystem type
+ * and the set of valid options for it, and converts the user's
+ * argument string into a binary structure _before_ making the
+ * system call), and the ABI issues that this implies.
+ *
+ * In Linux, we are passed a comma separated set of options;
+ * i.e. a NUL-terminated string of characters. Userspace mount
+ * code does not have any knowledge of mount options expected by
+ * each filesystem type and so each filesystem parses its mount
+ * options in kernel space.
+ *
+ * For the Linux port, we kept this structure pretty much intact
+ * and use it internally (because the existing code groks it).
+ */
+struct xfs_mount_args {
+	int	flags;		/* flags -> see XFSMNT_... macros below */
+	int	logbufs;	/* Number of log buffers, -1 to default */
+	int	logbufsize;	/* Size of log buffers, -1 to default */
+	char	fsname[MAXNAMELEN];	/* data device name */
+	char	rtname[MAXNAMELEN];	/* realtime device filename */
+	char	logname[MAXNAMELEN];	/* journal device filename */
+	char	mtpt[MAXNAMELEN];	/* filesystem mount point */
+	int	sunit;		/* stripe unit (BBs) */
+	int	swidth;		/* stripe width (BBs), multiple of sunit */
+	uchar_t iosizelog;	/* log2 of the preferred I/O size */
+
+	/* The remainder is for CXFS support. */
+	char	**servlist;	/* Table of hosts which may be servers */
+	int	*servlistlen;	/* Table of hostname lengths. */
+	int	slcount;	/* Count of hosts which may be servers. */
+	int	stimeout;	/* Server timeout in milliseconds */
+	int	ctimeout;	/* Client timeout in milliseconds */
+	char	*server;	/* Designated server hostname (for remount). */
+	int	servlen;	/* Length of server hostname (for remount). */
+	int	servcell;	/* Server cell (internal testing only) */
+};
+
+/*
+ * XFS mount option flags
+ */
+#define XFSMNT_CHKLOG		0x00000001	/* check log */
+#define XFSMNT_WSYNC		0x00000002	/* safe mode nfs mount
+						 * compatible */
+#define XFSMNT_INO64		0x00000004	/* move inode numbers up
+						 * past 2^32 */
+#define XFSMNT_UQUOTA		0x00000008	/* user quota accounting */
+#define XFSMNT_PQUOTA		0x00000010	/* IRIX prj quota accounting */
+#define XFSMNT_UQUOTAENF	0x00000020	/* user quota limit
+						 * enforcement */
+#define XFSMNT_PQUOTAENF	0x00000040	/* IRIX project quota limit
+						 * enforcement */
+#define XFSMNT_QUOTAMAYBE	0x00000080	/* don't turn off if SB
+						 * has quotas on */
+#define XFSMNT_NOATIME		0x00000100	/* don't modify access
+						 * times on reads */
+#define XFSMNT_NOALIGN		0x00000200	/* don't allocate at
+						 * stripe boundaries */
+#define XFSMNT_RETERR		0x00000400	/* return error to user */
+#define XFSMNT_NORECOVERY	0x00000800	/* no recovery, implies
+						 * read-only mount */
+#define XFSMNT_SHARED		0x00001000	/* shared XFS mount */
+#define XFSMNT_IOSIZE		0x00002000	/* optimize for I/O size */
+#define XFSMNT_OSYNCISOSYNC	0x00004000	/* o_sync is REALLY o_sync */
+						/* (osyncisdsync is now default) */
+#define XFSMNT_CLNTONLY		0x00008000	/* cxfs mount as client only */
+#define XFSMNT_UNSHARED		0x00010000	/* cxfs filesystem mounted
+						 * unshared */
+#define XFSMNT_CHGCLNTONLY	0x00020000	/* changing client only flag */
+						/* (for remount only) */
+#define XFSMNT_SERVCELL		0x00040000	/* setting server cell */
+						/* (allowed on remount) */
+#define XFSMNT_MAKESERVER	0x00080000	/* become the server (remount */
+						/* only) */
+#define XFSMNT_NOTSERVER	0x00100000	/* give up being the server */
+						/* (remount only) */
+#define XFSMNT_DMAPI		0x00200000	/* enable dmapi/xdsm */
+#define XFSMNT_GQUOTA		0x00400000	/* group quota accounting */
+#define XFSMNT_GQUOTAENF	0x00800000	/* group quota limit
+						 * enforcement */
+#define XFSMNT_NOUUID		0x01000000	/* Ignore fs uuid */
+#define XFSMNT_32BITINODES	0x02000000	/* restrict inodes to 32
+						 * bits of address space */
+#define XFSMNT_IRIXSGID		0x04000000	/* Irix-style sgid inheritance */
+#define XFSMNT_NOLOGFLUSH	0x08000000	/* Don't flush for log blocks */
+
+/* Did we get any args for CXFS to consume? (negative count/timeout appears to mean "not given" -- confirm) */
+#define XFSARGS_FOR_CXFSARR(ap)		\
+	((ap)->servlist || (ap)->slcount >= 0 || \
+	 (ap)->stimeout >= 0 || (ap)->ctimeout >= 0 || \
+	 (ap)->flags & (XFSMNT_CLNTONLY | XFSMNT_UNSHARED))
+
+#endif	/* __XFS_CLNT_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_da_btree.c linux-2.4-xfs/fs/xfs/xfs_da_btree.c
--- linux-2.4.19/fs/xfs/xfs_da_btree.c	Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_da_btree.c	Thu Aug 8 20:03:32 2002
@@ -0,0 +1,2579 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.
Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + + +#if defined(XFSDEBUG) && defined(CONFIG_KDB) +#undef xfs_buftrace +#define xfs_buftrace(A,B) \ + printk(" xfs_buftrace : %s (0x%p)\n", A, B); \ + BUG(); +#endif + +/* + * xfs_da_btree.c + * + * Routines to implement directories as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_da_root_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_root, + xfs_da_state_blk_t *new_child); +STATIC int xfs_da_node_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_blk, + xfs_da_state_blk_t *split_blk, + xfs_da_state_blk_t *blk_to_add, + int treelevel, + int *result); +STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *node_blk_1, + xfs_da_state_blk_t *node_blk_2); +STATIC void xfs_da_node_add(xfs_da_state_t *state, + xfs_da_state_blk_t *old_node_blk, + xfs_da_state_blk_t *new_node_blk); + +/* + * Routines used for shrinking the Btree. 
+ */
+STATIC int xfs_da_root_join(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+					      xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *src_node_blk,
+					 xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC uint	xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
+STATIC int	xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
+STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int	/* 0 on success, else the error from xfs_da_get_buf() */
+xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+				 xfs_dabuf_t **bpp, int whichfork)
+{
+	xfs_da_intnode_t *node;
+	xfs_dabuf_t *bp;
+	int error;
+	xfs_trans_t *tp;
+
+	tp = args->trans;
+	error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+	if (error)
+		return(error);	/* *bpp is left untouched on failure */
+	ASSERT(bp != NULL);
+	node = bp->data;
+	INT_ZERO(node->hdr.info.forw, ARCH_CONVERT);	/* no sibling links yet */
+	INT_ZERO(node->hdr.info.back, ARCH_CONVERT);
+	INT_SET(node->hdr.info.magic, ARCH_CONVERT, XFS_DA_NODE_MAGIC);
+	INT_ZERO(node->hdr.info.pad, ARCH_CONVERT);
+	INT_ZERO(node->hdr.count, ARCH_CONVERT);	/* node starts with no entries */
+	INT_SET(node->hdr.level, ARCH_CONVERT, level);
+
+	xfs_da_log_buf(tp, bp,	/* only the header was written, so log only that */
+		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+	*bpp = bp;	/* hand the buffer back to the caller */
+	return(0);
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int							/* error */
+xfs_da_split(xfs_da_state_t *state)
+{
+	xfs_da_state_blk_t *oldblk, *newblk, *addblk;
+	xfs_da_intnode_t *node;
+	xfs_dabuf_t *bp;
+	int max, action, error, i;	/* max: index of the leaf in state->path.blk[] */
+
+	/*
+	 * Walk back up the tree splitting/inserting/adjusting as necessary.
+	 * If we need to insert and there isn't room, split the node, then
+	 * decide which fragment to insert the new block from below into.
+	 * Note that we may split the root this way, but we need more fixup.
+	 */
+	max = state->path.active - 1;
+	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+	       state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+
+	addblk = &state->path.blk[max];		/* initial dummy value */
+	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+		oldblk = &state->path.blk[i];
+		newblk = &state->altpath.blk[i];
+
+		/*
+		 * If a leaf node then
+		 *     Allocate a new leaf node, then rebalance across them.
+		 * else if an intermediate node then
+		 *     We split on the last layer, must we split the node?
+		 */
+		switch (oldblk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+#ifndef __KERNEL__
+			return(ENOTTY);
+#else
+			error = xfs_attr_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != ENOSPC)) {	/* ENOSPC: retried below */
+				return(error);	/* GROT: attr is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				error = xfs_attr_leaf_split(state, oldblk,
+							    &state->extrablk);
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				error = xfs_attr_leaf_split(state, newblk,
+							    &state->extrablk);
+			}
+			if (error)
+				return(error);	/* GROT: attr inconsistent */
+			addblk = newblk;
+			break;
+#endif
+		case XFS_DIR_LEAF_MAGIC:
+			ASSERT(XFS_DIR_IS_V1(state->mp));
+			error = xfs_dir_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != ENOSPC)) {	/* ENOSPC: retried below */
+				return(error);	/* GROT: dir is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				error = xfs_dir_leaf_split(state, oldblk,
+							   &state->extrablk);
+				if (error)
+					return(error);	/* GROT: dir incon. */
+				addblk = newblk;
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				error = xfs_dir_leaf_split(state, newblk,
+							   &state->extrablk);
+				if (error)
+					return(error);	/* GROT: dir incon. */
+				addblk = newblk;
+			}
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			ASSERT(XFS_DIR_IS_V2(state->mp));
+			error = xfs_dir2_leafn_split(state, oldblk, newblk);
+			if (error)
+				return error;
+			addblk = newblk;
+			break;
+		case XFS_DA_NODE_MAGIC:
+			error = xfs_da_node_split(state, oldblk, newblk, addblk,
+							 max - i, &action);
+			xfs_da_buf_done(addblk->bp);	/* released whether or not the split failed */
+			addblk->bp = NULL;
+			if (error)
+				return(error);	/* GROT: dir is inconsistent */
+			/*
+			 * Record the newly split block for the next time thru?
+			 */
+			if (action)
+				addblk = newblk;
+			else
+				addblk = NULL;	/* nothing to add above: ends the loop */
+			break;
+		}
+
+		/*
+		 * Update the btree to show the new hashval for this child.
+		 */
+		xfs_da_fixhashpath(state, &state->path);
+		/*
+		 * If we won't need this block again, it's getting dropped
+		 * from the active path by the loop control, so we need
+		 * to mark it done now.
+		 */
+		if (i > 0 || !addblk)
+			xfs_da_buf_done(oldblk->bp);
+	}
+	if (!addblk)
+		return(0);	/* no block left to insert, so the root is not split */
+
+	/*
+	 * Split the root node.
+	 */
+	ASSERT(state->path.active == 0);
+	oldblk = &state->path.blk[0];
+	error = xfs_da_root_split(state, oldblk, addblk);
+	if (error) {
+		xfs_da_buf_done(oldblk->bp);
+		xfs_da_buf_done(addblk->bp);
+		addblk->bp = NULL;
+		return(error);	/* GROT: dir is inconsistent */
+	}
+
+	/*
+	 * Update pointers to the node which used to be block 0 and
+	 * just got bumped because of the addition of a new root node.
+	 * There might be three blocks involved if a double split occurred,
+	 * and the original block 0 could be at any position in the list.
+	 */
+
+	node = oldblk->bp->data;
+	if (!INT_ISZERO(node->hdr.info.forw, ARCH_CONVERT)) {	/* fix forward sibling's back link */
+		if (INT_GET(node->hdr.info.forw, ARCH_CONVERT) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->data;
+		INT_SET(node->hdr.info.back, ARCH_CONVERT, oldblk->blkno);
+		xfs_da_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	node = oldblk->bp->data;
+	if (INT_GET(node->hdr.info.back, ARCH_CONVERT)) {	/* fix backward sibling's forw link */
+		if (INT_GET(node->hdr.info.back, ARCH_CONVERT) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->data;
+		INT_SET(node->hdr.info.forw, ARCH_CONVERT, oldblk->blkno);
+		xfs_da_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	xfs_da_buf_done(oldblk->bp);
+	xfs_da_buf_done(addblk->bp);
+	addblk->bp = NULL;
+	return(0);
+}
+
+/*
+ * Split the root. We have to create a new root and point to the two
+ * parts (the split old root) that we just created. Copy block zero to
+ * the EOF, extending the inode in the process.
+ */ +STATIC int /* error */ +xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node, *oldroot; + xfs_da_args_t *args; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + int error, size; + xfs_inode_t *dp; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_dir2_leaf_t *leaf; + + /* + * Copy the existing (incorrect) block from the root node position + * to a free space somewhere. + */ + args = state->args; + ASSERT(args != NULL); + error = xfs_da_grow_inode(args, &blkno); + if (error) + return(error); + dp = args->dp; + tp = args->trans; + mp = state->mp; + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + node = bp->data; + oldroot = blk1->bp->data; + if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) { + size = (int)((char *)&oldroot->btree[INT_GET(oldroot->hdr.count, ARCH_CONVERT)] - + (char *)oldroot); + } else { + ASSERT(XFS_DIR_IS_V2(mp)); + ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + leaf = (xfs_dir2_leaf_t *)oldroot; + size = (int)((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] - + (char *)leaf); + } + bcopy(oldroot, node, size); /* copy only the header plus the in-use entries */ + xfs_da_log_buf(tp, bp, 0, size - 1); + xfs_da_buf_done(blk1->bp); + blk1->bp = bp; /* blk1 now refers to the relocated copy */ + blk1->blkno = blkno; + + /* + * Set up the new root node. + */ + error = xfs_da_node_create(args, + args->whichfork == XFS_DATA_FORK && + XFS_DIR_IS_V2(mp) ? 
mp->m_dirleafblk : 0, + INT_GET(node->hdr.level, ARCH_CONVERT) + 1, &bp, args->whichfork); + if (error) + return(error); + node = bp->data; + INT_SET(node->btree[0].hashval, ARCH_CONVERT, blk1->hashval); + INT_SET(node->btree[0].before, ARCH_CONVERT, blk1->blkno); + INT_SET(node->btree[1].hashval, ARCH_CONVERT, blk2->hashval); + INT_SET(node->btree[1].before, ARCH_CONVERT, blk2->blkno); + INT_SET(node->hdr.count, ARCH_CONVERT, 2); /* the new root has exactly two entries: blk1 and blk2 */ + if (XFS_DIR_IS_V2(mp)) { + ASSERT(blk1->blkno >= mp->m_dirleafblk && + blk1->blkno < mp->m_dirfreeblk); + ASSERT(blk2->blkno >= mp->m_dirleafblk && + blk2->blkno < mp->m_dirfreeblk); + } + /* Header is already logged by xfs_da_node_create */ + xfs_da_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, node->btree, + sizeof(xfs_da_node_entry_t) * 2)); + xfs_da_buf_done(bp); + + return(0); +} + +/* + * Split the node if it is full, rebalance, then add the new entry; *result is set to 1 when the node was split (newblk is then in use) and to 0 otherwise. + */ +STATIC int /* error */ +xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk, + xfs_da_state_blk_t *addblk, + int treelevel, int *result) +{ + xfs_da_intnode_t *node; + xfs_dablk_t blkno; + int newcount, error; + int useextra; + + node = oldblk->bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + + /* + * With V2 the extra block is data or freespace. + */ + useextra = state->extravalid && XFS_DIR_IS_V1(state->mp); + newcount = 1 + useextra; /* number of entries about to be inserted */ + /* + * Do we have to split the node? + */ + if ((INT_GET(node->hdr.count, ARCH_CONVERT) + newcount) > XFS_DA_NODE_ENTRIES(state->mp)) { + /* + * Allocate a new node, add to the doubly linked chain of + * nodes, then move some of our excess entries into it. 
+ */ + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return(error); /* GROT: dir is inconsistent */ + + error = xfs_da_node_create(state->args, blkno, treelevel, + &newblk->bp, state->args->whichfork); + if (error) + return(error); /* GROT: dir is inconsistent */ + newblk->blkno = blkno; + newblk->magic = XFS_DA_NODE_MAGIC; + xfs_da_node_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + *result = 1; /* a new node was created in newblk */ + } else { + *result = 0; /* the entries fit without a split */ + } + + /* + * Insert the new entry (or entries) into the correct block + * (updating last hashval in the process). + * + * xfs_da_node_add() inserts BEFORE the given index, + * and as a result of using node_lookup_int() we always + * point to a valid entry (not after one), but a split + * operation always results in a new block whose hashvals + * FOLLOW the current block. + * + * If we had double-split op below us, then add the extra block too. + */ + node = oldblk->bp->data; + if (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)) { /* rebalance sets oldblk->index past count when the insert point moved to newblk */ + oldblk->index++; + xfs_da_node_add(state, oldblk, addblk); + if (useextra) { + if (state->extraafter) + oldblk->index++; + xfs_da_node_add(state, oldblk, &state->extrablk); + state->extravalid = 0; + } + } else { + newblk->index++; + xfs_da_node_add(state, newblk, addblk); + if (useextra) { + if (state->extraafter) + newblk->index++; + xfs_da_node_add(state, newblk, &state->extrablk); + state->extravalid = 0; + } + } + + return(0); +} + +/* + * Balance the btree elements between two intermediate nodes, + * usually one full and one empty. + * + * NOTE: if blk2 is empty, then it will get the upper half of blk1. 
+ */ +STATIC void +xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node1, *node2, *tmpnode; + xfs_da_node_entry_t *btree_s, *btree_d; + int count, tmp; + xfs_trans_t *tp; + + node1 = blk1->bp->data; + node2 = blk2->bp->data; + /* + * Figure out how many entries need to move, and in which direction. + * Swap the nodes around if that makes it simpler. + */ + if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) && + ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) || + (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < + INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) { + tmpnode = node1; + node1 = node2; + node2 = tmpnode; /* after the swap node1 is the node with the lower hashvals */ + } + ASSERT(INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT(INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + count = (INT_GET(node1->hdr.count, ARCH_CONVERT) - INT_GET(node2->hdr.count, ARCH_CONVERT)) / 2; /* > 0: move node1 -> node2, < 0: move node2 -> node1 */ + if (count == 0) + return; + tp = state->args->trans; + /* + * Two cases: high-to-low and low-to-high. + */ + if (count > 0) { + /* + * Move elements in node2 up to make a hole. + */ + if ((tmp = INT_GET(node2->hdr.count, ARCH_CONVERT)) > 0) { + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node2->btree[count]; + ovbcopy(btree_s, btree_d, tmp); + } + + /* + * Move the required B-tree elements from high in node1 to + * low in node2. 
+ */ + INT_MOD(node2->hdr.count, ARCH_CONVERT, count); + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT) - count]; + btree_d = &node2->btree[0]; + bcopy(btree_s, btree_d, tmp); + INT_MOD(node1->hdr.count, ARCH_CONVERT, -(count)); + + } else { + /* + * Move the required B-tree elements from low in node2 to + * high in node1. + */ + count = -count; /* work with the magnitude from here on */ + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT)]; + bcopy(btree_s, btree_d, tmp); + INT_MOD(node1->hdr.count, ARCH_CONVERT, count); + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, btree_d, tmp)); + + /* + * Move elements in node2 down to fill the hole. + */ + tmp = INT_GET(node2->hdr.count, ARCH_CONVERT) - count; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[count]; + btree_d = &node2->btree[0]; + ovbcopy(btree_s, btree_d, tmp); + INT_MOD(node2->hdr.count, ARCH_CONVERT, -(count)); + } + + /* + * Log header of node 1 and all current bits of node 2. NOTE(review): blk1->bp and blk2->bp are not swapped along with node1/node2 above, so confirm these ranges hit the intended buffers when the swap was taken. + */ + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + xfs_da_log_buf(tp, blk2->bp, + XFS_DA_LOGRANGE(node2, &node2->hdr, + sizeof(node2->hdr) + + sizeof(node2->btree[0]) * INT_GET(node2->hdr.count, ARCH_CONVERT))); + + /* + * Record the last hashval from each block for upward propagation. + * (note: don't use the swapped node pointers) + */ + node1 = blk1->bp->data; + node2 = blk2->bp->data; + blk1->hashval = INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + blk2->hashval = INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + + /* + * Adjust the expected index for insertion. 
+ */ + if (blk1->index >= INT_GET(node1->hdr.count, ARCH_CONVERT)) { + blk2->index = blk1->index - INT_GET(node1->hdr.count, ARCH_CONVERT); + blk1->index = INT_GET(node1->hdr.count, ARCH_CONVERT) + 1; /* make it invalid (past the end of blk1) */ + } +} + +/* + * Add a new entry to an intermediate node: an entry for newblk is inserted into oldblk's node at oldblk->index. + */ +STATIC void +xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + xfs_mount_t *mp; + + node = oldblk->bp->data; + mp = state->mp; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT((oldblk->index >= 0) && (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT))); + ASSERT(newblk->blkno != 0); + if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) + ASSERT(newblk->blkno >= mp->m_dirleafblk && + newblk->blkno < mp->m_dirfreeblk); + + /* + * We may need to make some room before we insert the new node. + */ + tmp = 0; /* bytes of existing entries shifted up; stays 0 when appending */ + btree = &node->btree[ oldblk->index ]; + if (oldblk->index < INT_GET(node->hdr.count, ARCH_CONVERT)) { + tmp = (INT_GET(node->hdr.count, ARCH_CONVERT) - oldblk->index) * (uint)sizeof(*btree); + ovbcopy(btree, btree + 1, tmp); + } + INT_SET(btree->hashval, ARCH_CONVERT, newblk->hashval); + INT_SET(btree->before, ARCH_CONVERT, newblk->blkno); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); + INT_MOD(node->hdr.count, ARCH_CONVERT, +1); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the oldblk to propagate upwards. + */ + oldblk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); +} + +/*======================================================================== + * Routines used for shrinking the Btree. 
+ *========================================================================*/ + +/* + * Deallocate an empty leaf node, remove it from its parent, + * possibly deallocating that block, etc... Returns 0 or an error code. + */ +int +xfs_da_join(xfs_da_state_t *state) +{ + xfs_da_state_blk_t *drop_blk, *save_blk; + int action, error; + + action = 0; + drop_blk = &state->path.blk[ state->path.active-1 ]; /* deepest block on the path */ + save_blk = &state->altpath.blk[ state->path.active-1 ]; /* same level in altpath */ + ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC || + drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp)); + + /* + * Walk back up the tree joining/deallocating as necessary. + * When we stop dropping blocks, break out. + */ + for ( ; state->path.active >= 2; drop_blk--, save_blk--, + state->path.active--) { + /* + * See if we can combine the block with a neighbor. + * (action == 0) => no options, just leave + * (action == 1) => coalesce, then unlink + * (action == 2) => block empty, unlink it + */ + switch (drop_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: +#ifndef __KERNEL__ + error = ENOTTY; +#else + error = xfs_attr_leaf_toosmall(state, &action); +#endif + if (error) + return(error); + if (action == 0) + return(0); +#ifdef __KERNEL__ + xfs_attr_leaf_unbalance(state, drop_blk, save_blk); +#endif + break; + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + error = xfs_dir_leaf_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return(0); + xfs_dir_leaf_unbalance(state, drop_blk, save_blk); + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + error = xfs_dir2_leafn_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_dir2_leafn_unbalance(state, drop_blk, save_blk); + break; + case XFS_DA_NODE_MAGIC: + /* + * Remove the offending node, fixup hashvals, + * check for a toosmall neighbor. 
+ */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_node_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return 0; + xfs_da_node_unbalance(state, drop_blk, save_blk); + break; + } + xfs_da_fixhashpath(state, &state->altpath); + error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da_state_kill_altpath(state); + if (error) + return(error); + error = xfs_da_shrink_inode(state->args, drop_blk->blkno, + drop_blk->bp); + drop_blk->bp = NULL; /* bp was passed to xfs_da_shrink_inode; it is not used again here */ + if (error) + return(error); + } + /* + * We joined all the way to the top. If it turns out that + * we only have one entry in the root, make the child block + * the new root. + */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_root_join(state, &state->path.blk[0]); + return(error); +} + +/* + * We have only one entry in the root. Copy the only remaining child of + * the old root to block 0 as the new root node (nothing is done if the root still has more than one entry). + */ +STATIC int +xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +{ + xfs_da_intnode_t *oldroot; + /* REFERENCED */ + xfs_da_blkinfo_t *blkinfo; + xfs_da_args_t *args; + xfs_dablk_t child; + xfs_dabuf_t *bp; + int error; + + args = state->args; + ASSERT(args != NULL); + ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + oldroot = root_blk->bp->data; + ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT(INT_ISZERO(oldroot->hdr.info.forw, ARCH_CONVERT)); + ASSERT(INT_ISZERO(oldroot->hdr.info.back, ARCH_CONVERT)); + + /* + * If the root has more than one child, then don't do anything. + */ + if (INT_GET(oldroot->hdr.count, ARCH_CONVERT) > 1) + return(0); + + /* + * Read in the (only) child block, then copy those bytes into + * the root block's buffer and free the original child block. 
+ */ + child = INT_GET(oldroot->btree[ 0 ].before, ARCH_CONVERT); + ASSERT(child != 0); + error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + blkinfo = bp->data; + if (INT_GET(oldroot->hdr.level, ARCH_CONVERT) == 1) { /* a level-1 root is expected to have a leaf child */ + ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) || + INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC); + } else { + ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + } + ASSERT(INT_ISZERO(blkinfo->forw, ARCH_CONVERT)); + ASSERT(INT_ISZERO(blkinfo->back, ARCH_CONVERT)); + bcopy(bp->data, root_blk->bp->data, state->blocksize); /* the whole child block overwrites the root block */ + xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + error = xfs_da_shrink_inode(args, child, bp); + return(error); +} + +/* + * Check a node block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, set *action to 0. + * If the block is empty, fill in the state structure and set *action to 2. + * If it can be collapsed, fill in the state structure and set *action to 1. + * If nothing can be done, set *action to 0. The function itself returns 0 or an error code. + */ +STATIC int +xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_da_intnode_t *node; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. 
+ */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + node = (xfs_da_intnode_t *)info; + count = INT_GET(node->hdr.count, ARCH_CONVERT); + if (count > (XFS_DA_NODE_ENTRIES(state->mp) >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); /* blk over 50%, don't try to join */ + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (!INT_ISZERO(info->forw, ARCH_CONVERT)); + bcopy(&state->path, &state->altpath, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. 
+ */ + /* start with smaller blk num */ + forward = (INT_GET(info->forw, ARCH_CONVERT) + < INT_GET(info->back, ARCH_CONVERT)); + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = INT_GET(info->forw, ARCH_CONVERT); + else + blkno = INT_GET(info->back, ARCH_CONVERT); + if (blkno == 0) /* no sibling on this side */ + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, state->args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + + node = (xfs_da_intnode_t *)info; + count = XFS_DA_NODE_ENTRIES(state->mp); /* capacity, less a 25% reserve, less both nodes' entry counts */ + count -= XFS_DA_NODE_ENTRIES(state->mp) >> 2; + count -= INT_GET(node->hdr.count, ARCH_CONVERT); + node = bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + count -= INT_GET(node->hdr.count, ARCH_CONVERT); + xfs_da_brelse(state->args->trans, bp); + if (count >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { /* neither side offered a sibling that fits */ + *action = 0; + return(0); + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + bcopy(&state->path, &state->altpath, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } + *action = 1; + return(0); +} + +/* + * Walk back up the tree adjusting hash values as necessary, + * when we stop making changes, return. 
+ */ +void +xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +{ + xfs_da_state_blk_t *blk; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dahash_t lasthash=0; + int level, count; + + level = path->active-1; + blk = &path->blk[ level ]; + switch (blk->magic) { +#ifdef __KERNEL__ + case XFS_ATTR_LEAF_MAGIC: + lasthash = xfs_attr_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; +#endif + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + lasthash = xfs_dir_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DA_NODE_MAGIC: + lasthash = xfs_da_node_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + } + for (blk--, level--; level >= 0; blk--, level--) { + node = blk->bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + btree = &node->btree[ blk->index ]; + if (INT_GET(btree->hashval, ARCH_CONVERT) == lasthash) /* parent entry already matches: stop here */ + break; + blk->hashval = lasthash; + INT_SET(btree->hashval, ARCH_CONVERT, lasthash); + xfs_da_log_buf(state->args->trans, blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + + lasthash = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); /* this node's last hashval is compared one level up */ + } +} + +/* + * Remove the entry at drop_blk->index from an intermediate node. + */ +STATIC void +xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + + node = drop_blk->bp->data; + ASSERT(drop_blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)); + ASSERT(drop_blk->index >= 0); + + /* + * Copy over the offending entry, or just zero it out. 
+ */ + btree = &node->btree[drop_blk->index]; + if (drop_blk->index < (INT_GET(node->hdr.count, ARCH_CONVERT)-1)) { + tmp = INT_GET(node->hdr.count, ARCH_CONVERT) - drop_blk->index - 1; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + ovbcopy(btree + 1, btree, tmp); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, tmp)); + btree = &node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ]; /* the old last slot is now a duplicate */ + } + bzero((char *)btree, sizeof(xfs_da_node_entry_t)); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + INT_MOD(node->hdr.count, ARCH_CONVERT, -1); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the block to propagate upwards. + */ + btree--; /* step back from the zeroed slot; NOTE(review): this points before btree[0] if the node just became empty - confirm callers do not rely on hashval then */ + drop_blk->hashval = INT_GET(btree->hashval, ARCH_CONVERT); +} + +/* + * Unbalance the btree elements between two intermediate nodes, + * move all Btree elements from drop_blk's node into save_blk's node. + */ +STATIC void +xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_da_intnode_t *drop_node, *save_node; + xfs_da_node_entry_t *btree; + int tmp; + xfs_trans_t *tp; + + drop_node = drop_blk->bp->data; + save_node = save_blk->bp->data; + ASSERT(INT_GET(drop_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT(INT_GET(save_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + tp = state->args->trans; + + /* + * If the dying block has lower hashvals, then move all the + * elements in the remaining block up to make a hole. 
+ */ + if ((INT_GET(drop_node->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(save_node->btree[ 0 ].hashval, ARCH_CONVERT)) || + (INT_GET(drop_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < + INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT))) + { + btree = &save_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT) ]; + tmp = INT_GET(save_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t); + ovbcopy(&save_node->btree[0], btree, tmp); + btree = &save_node->btree[0]; /* dropped entries go in front */ + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + (INT_GET(save_node->hdr.count, ARCH_CONVERT) + INT_GET(drop_node->hdr.count, ARCH_CONVERT)) * + sizeof(xfs_da_node_entry_t))); + } else { + btree = &save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT) ]; /* dropped entries are appended */ + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + INT_GET(drop_node->hdr.count, ARCH_CONVERT) * + sizeof(xfs_da_node_entry_t))); + } + + /* + * Move all the B-tree elements from drop_blk to save_blk. + */ + tmp = INT_GET(drop_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t); + bcopy(&drop_node->btree[0], btree, tmp); + INT_MOD(save_node->hdr.count, ARCH_CONVERT, INT_GET(drop_node->hdr.count, ARCH_CONVERT)); + + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_node->hdr, + sizeof(save_node->hdr))); + + /* + * Save the last hashval in the remaining block for upward propagation. + */ + save_blk->hashval = INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Walk down the Btree looking for a particular filename, filling + * in the state structure as we go. 
+ * + * We will set the state structure to point to each of the elements + * in each of the nodes where either the hashval is or should be. + * + * We support duplicate hashvals so for each entry in the current + * node that could contain the desired hashval, descend. This is a + * pruned depth-first tree search. *result receives the leaf lookup's return value; the function itself returns 0 or an error from reading a block or shifting the path. + */ +int /* error */ +xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +{ + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *curr; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dablk_t blkno; + int probe, span, max, error, retval; /* NOTE(review): retval is assigned only when blk->magic matches one of the leaf cases in the final loop */ + xfs_dahash_t hashval; + xfs_da_args_t *args; + + args = state->args; + /* + * Descend through the B-tree searching each level for the right + * node to use, until the right hashval is found. + */ + if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp)) + blkno = state->mp->m_dirleafblk; + else + blkno = 0; + for (blk = &state->path.blk[0], state->path.active = 1; + state->path.active <= XFS_DA_NODE_MAXDEPTH; + blk++, state->path.active++) { + /* + * Read the next node down in the tree. + */ + blk->blkno = blkno; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &blk->bp, + state->args->whichfork); + if (error) { + blk->blkno = 0; + state->path.active--; + return(error); + } + ASSERT(blk->bp != NULL); + curr = blk->bp->data; + ASSERT(INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC || + INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) || + INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC); + + /* + * Search an intermediate node for a match. + */ + blk->magic = INT_GET(curr->magic, ARCH_CONVERT); + if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) { + node = blk->bp->data; + blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + + /* + * Binary search. 
(note: small blocks will skip loop) + */ + max = INT_GET(node->hdr.count, ARCH_CONVERT); + probe = span = max / 2; + hashval = state->args->hashval; + for (btree = &node->btree[probe]; span > 4; + btree = &node->btree[probe]) { + span /= 2; + if (INT_GET(btree->hashval, ARCH_CONVERT) < hashval) + probe += span; + else if (INT_GET(btree->hashval, ARCH_CONVERT) > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || (INT_GET(btree->hashval, ARCH_CONVERT) == hashval)); + + /* + * Since we may have duplicate hashvals, find the first + * matching hashval in the node. + */ + while ((probe > 0) && (INT_GET(btree->hashval, ARCH_CONVERT) >= hashval)) { + btree--; + probe--; + } + while ((probe < max) && (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)) { + btree++; + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { /* every entry is below hashval: follow the last child */ + blk->index = max-1; + blkno = INT_GET(node->btree[ max-1 ].before, ARCH_CONVERT); + } else { + blk->index = probe; + blkno = INT_GET(btree->before, ARCH_CONVERT); + } + } +#ifdef __KERNEL__ + else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) { + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } +#endif + else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) { + blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL); + break; + } + else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) { + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + break; + } + } + + /* + * A leaf block that ends in the hashval that we are interested in + * (final hashval == search hashval) means that the next block may + * contain more entries with the same hashval, shift upward to the + * next leaf and keep searching. 
+ */ + for (;;) { + if (blk->magic == XFS_DIR_LEAF_MAGIC) { + ASSERT(XFS_DIR_IS_V1(state->mp)); + retval = xfs_dir_leaf_lookup_int(blk->bp, state->args, + &blk->index); + } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + ASSERT(XFS_DIR_IS_V2(state->mp)); + retval = xfs_dir2_leafn_lookup_int(blk->bp, state->args, + &blk->index, state); + } +#ifdef __KERNEL__ + else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + retval = xfs_attr_leaf_lookup_int(blk->bp, state->args); + blk->index = state->args->index; + state->args->blkno = blk->blkno; + } +#endif + if (((retval == ENOENT) || (retval == ENOATTR)) && + (blk->hashval == state->args->hashval)) { + error = xfs_da_path_shift(state, &state->path, 1, 1, + &retval); + if (error) + return(error); + if (retval == 0) { + continue; /* the path shift succeeded: run the leaf lookup again */ + } +#ifdef __KERNEL__ + else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + /* path_shift() gives ENOENT; report ENOATTR for attribute leaves */ + retval = XFS_ERROR(ENOATTR); + } +#endif + } + break; + } + *result = retval; + return(0); +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Link a new block into a doubly linked list of blocks (of whatever type); new_blk goes before old_blk if the type-specific order routine returns nonzero, otherwise after it. + */ +int /* error */ +xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk) +{ + xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; + xfs_da_args_t *args; + int before=0, error; + xfs_dabuf_t *bp; + + /* + * Set up environment. 
+ */ + args = state->args; + ASSERT(args != NULL); + old_info = old_blk->bp->data; + new_info = new_blk->bp->data; + ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || + old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) || + old_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(old_blk->magic == INT_GET(old_info->magic, ARCH_CONVERT)); + ASSERT(new_blk->magic == INT_GET(new_info->magic, ARCH_CONVERT)); + ASSERT(old_blk->magic == new_blk->magic); + + switch (old_blk->magic) { +#ifdef __KERNEL__ + case XFS_ATTR_LEAF_MAGIC: + before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); + break; +#endif + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp); + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); + break; + case XFS_DA_NODE_MAGIC: + before = xfs_da_node_order(old_blk->bp, new_blk->bp); + break; + } + + /* + * Link blocks in appropriate order. + */ + if (before) { + /* + * Link new block in before existing block. + */ + INT_SET(new_info->forw, ARCH_CONVERT, old_blk->blkno); + new_info->back = old_info->back; /* INT_: direct copy */ + if (INT_GET(old_info->back, ARCH_CONVERT)) { /* old_blk had a predecessor: repoint its forw at new_blk */ + error = xfs_da_read_buf(args->trans, args->dp, + INT_GET(old_info->back, + ARCH_CONVERT), -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(old_info->magic, ARCH_CONVERT)); + ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == old_blk->blkno); + INT_SET(tmp_info->forw, ARCH_CONVERT, new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + INT_SET(old_info->back, ARCH_CONVERT, new_blk->blkno); + } else { + /* + * Link new block in after existing block. 
+ */ + new_info->forw = old_info->forw; /* INT_: direct copy */ + INT_SET(new_info->back, ARCH_CONVERT, old_blk->blkno); + if (INT_GET(old_info->forw, ARCH_CONVERT)) { /* old_blk had a successor: repoint its back at new_blk */ + error = xfs_da_read_buf(args->trans, args->dp, + INT_GET(old_info->forw, ARCH_CONVERT), -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) + == INT_GET(old_info->magic, ARCH_CONVERT)); + ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT) + == old_blk->blkno); + INT_SET(tmp_info->back, ARCH_CONVERT, new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + INT_SET(old_info->forw, ARCH_CONVERT, new_blk->blkno); + } + + xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); /* log just the blkinfo-sized prefix of each block */ + xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + return(0); +} + +/* + * Compare two intermediate nodes for "order". Returns 1 when both nodes are non-empty and node2's first or last hashval is below node1's, otherwise 0. + */ +STATIC int +xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) +{ + xfs_da_intnode_t *node1, *node2; + + node1 = node1_bp->data; + node2 = node2_bp->data; + ASSERT((INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) && + (INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC)); + if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) && + ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < + INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) || + (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < + INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from an intermediate node. If count is non-NULL the node's entry count is stored there; an empty node yields 0. 
+ */
+STATIC uint
+xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
+{
+	xfs_da_intnode_t *node = bp->data;
+	int nents;
+
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+
+	/*
+	 * Report the entry count if the caller asked for it; an empty
+	 * node has no last entry, so its hashvalue is reported as 0.
+	 */
+	nents = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (count != NULL)
+		*count = nents;
+	if (nents == 0)
+		return(0);
+	return(INT_GET(node->btree[ nents-1 ].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Take drop_blk out of the doubly linked chain of sibling blocks.
+ *
+ * save_blk must be an immediate neighbour of drop_blk (asserted below).
+ * It inherits drop_blk's link on the far side, and the block beyond
+ * drop_blk, if there is one, is re-pointed at save_blk, logged and
+ * released.  Returns 0, or the error from xfs_da_read_buf() when that
+ * far block cannot be read.
+ */
+int							/* error */
+xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				 xfs_da_state_blk_t *save_blk)
+{
+	xfs_da_blkinfo_t *save_info, *drop_info, *far_info;
+	xfs_da_args_t *args;
+	xfs_dabuf_t *far_bp;
+	xfs_dablk_t far_blkno;
+	int err;
+
+	/*
+	 * Pick up the block headers and sanity check the pair.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	save_info = save_blk->bp->data;
+	drop_info = drop_blk->bp->data;
+	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+	       save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == INT_GET(save_info->magic, ARCH_CONVERT));
+	ASSERT(drop_blk->magic == INT_GET(drop_info->magic, ARCH_CONVERT));
+	ASSERT(save_blk->magic == drop_blk->magic);
+	ASSERT((INT_GET(save_info->forw, ARCH_CONVERT) == drop_blk->blkno) ||
+	       (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno));
+	ASSERT((INT_GET(drop_info->forw, ARCH_CONVERT) == save_blk->blkno) ||
+	       (INT_GET(drop_info->back, ARCH_CONVERT) == save_blk->blkno));
+
+	if (INT_GET(save_info->back, ARCH_CONVERT) != drop_blk->blkno) {
+		/*
+		 * drop_blk is not behind save_blk, so it is handled as the
+		 * forward neighbour: splice save_blk to whatever follows it.
+		 */
+		save_info->forw = drop_info->forw; /* INT_: direct copy */
+		far_blkno = INT_GET(drop_info->forw, ARCH_CONVERT);
+		if (far_blkno) {
+			err = xfs_da_read_buf(args->trans, args->dp,
+					      far_blkno, -1, &far_bp,
+					      args->whichfork);
+			if (err)
+				return(err);
+			ASSERT(far_bp != NULL);
+			far_info = far_bp->data;
+			ASSERT(INT_GET(far_info->magic, ARCH_CONVERT)
+				    == INT_GET(save_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(far_info->back, ARCH_CONVERT)
+				    == drop_blk->blkno);
+			INT_SET(far_info->back, ARCH_CONVERT, save_blk->blkno);
+			xfs_da_log_buf(args->trans, far_bp, 0,
+				       sizeof(*far_info) - 1);
+			xfs_da_buf_done(far_bp);
+		}
+	} else {
+		/*
+		 * drop_blk sits behind save_blk: splice save_blk to
+		 * whatever precedes drop_blk.
+		 */
+		save_info->back = drop_info->back; /* INT_: direct copy */
+		far_blkno = INT_GET(drop_info->back, ARCH_CONVERT);
+		if (far_blkno) {
+			err = xfs_da_read_buf(args->trans, args->dp,
+					      far_blkno, -1, &far_bp,
+					      args->whichfork);
+			if (err)
+				return(err);
+			ASSERT(far_bp != NULL);
+			far_info = far_bp->data;
+			ASSERT(INT_GET(far_info->magic, ARCH_CONVERT) == INT_GET(save_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(far_info->forw, ARCH_CONVERT) == drop_blk->blkno);
+			INT_SET(far_info->forw, ARCH_CONVERT, save_blk->blkno);
+			xfs_da_log_buf(args->trans, far_bp, 0,
+				       sizeof(*far_info) - 1);
+			xfs_da_buf_done(far_bp);
+		}
+	}
+
+	/*
+	 * The surviving block's header changed in either case; log it.
+	 */
+	xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+	return(0);
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+/*
+ * state	supplies the operation arguments (state->args) and mount.
+ * path		path to adjust; its blk[] entries are updated in place.
+ * forward	nonzero: step to the next entry/block; zero: the previous.
+ * release	nonzero: xfs_da_brelse() each block being replaced in the
+ *		path before its successor is read.
+ * result	set to ENOENT when no node in the path has a neighbouring
+ *		entry in the requested direction, otherwise set to 0.
+ *
+ * The function's return value is 0 or the error from xfs_da_read_buf();
+ * "no sibling" is reported only through *result.
+ */
+int							/* error */
+xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+				 int forward, int release, int *result)
+{
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	xfs_da_intnode_t *node;
+	xfs_da_args_t *args;
+	xfs_dablk_t blkno=0;
+	int level, error;
+
+	/*
+	 * Roll up the Btree looking for the first block where our
+	 * current index is not at the edge of the block.  Note that
+	 * we skip the bottom layer because we want the sibling block.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(path != NULL);
+	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	level = (path->active-1) - 1;	/* skip bottom layer in path */
+	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+		ASSERT(blk->bp != NULL);
+		node = blk->bp->data;
+		ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		/*
+		 * Step this node's index one entry in the requested
+		 * direction and remember the child block it points at.
+		 */
+		if (forward && (blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
+			blk->index++;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+			break;
+		} else if (!forward && (blk->index > 0)) {
+			blk->index--;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+			break;
+		}
+	}
+	if (level < 0) {
+		*result = XFS_ERROR(ENOENT);	/* we're out of our tree */
+		ASSERT(args->oknoent);
+		return(0);
+	}
+
+	/*
+	 * Roll down the edge of the subtree until we reach the
+	 * same depth we were at originally.
+	 */
+	for (blk++, level++; level < path->active; blk++, level++) {
+		/*
+		 * Release the old block.
+		 * (if it's dirty, trans won't actually let go)
+		 */
+		if (release)
+			xfs_da_brelse(args->trans, blk->bp);
+
+		/*
+		 * Read the next child block.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
+						     &blk->bp, args->whichfork);
+		if (error)
+			return(error);
+		ASSERT(blk->bp != NULL);
+		info = blk->bp->data;
+		ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
+		       INT_GET(info->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+		blk->magic = INT_GET(info->magic, ARCH_CONVERT);
+		if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+			/*
+			 * Intermediate node: enter it at the edge nearest
+			 * to where we came from (first entry when moving
+			 * forward, last entry otherwise) and keep descending.
+			 */
+			node = (xfs_da_intnode_t *)info;
+			blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+			if (forward)
+				blk->index = 0;
+			else
+				blk->index = INT_GET(node->hdr.count, ARCH_CONVERT)-1;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+		} else {
+			/*
+			 * Leaf block: this must be the bottom of the path.
+			 * The index is reset to 0 for either direction and
+			 * the cached hashval is refreshed from the leaf.
+			 */
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			switch(blk->magic) {
+#ifdef __KERNEL__
+			case XFS_ATTR_LEAF_MAGIC:
+				blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
+								      NULL);
+				break;
+#endif
+			case XFS_DIR_LEAF_MAGIC:
+				ASSERT(XFS_DIR_IS_V1(state->mp));
+				blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
+								     NULL);
+				break;
+			case XFS_DIR2_LEAFN_MAGIC:
+				ASSERT(XFS_DIR_IS_V2(state->mp));
+				blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
+								       NULL);
+				break;
+			default:
+				ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
+				       blk->magic ==
+				       XFS_DIRX_LEAF_MAGIC(state->mp));
+				break;
+			}
+		}
+	}
+	*result = 0;
+	return(0);
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(uchar_t *name, int namelen)
+{
+	xfs_dahash_t hash;
+
+	/* Rotate left; the macro hard-codes a 32-bit operand width. */
+#define	ROTL(x,y)	(((x) << (y)) | ((x) >> (32 - (y))))
+#ifdef SLOWVERSION
+	/*
+	 * This is the old one-byte-at-a-time version.
+	 */
+	for (hash = 0; namelen > 0; namelen--) {
+		hash = *name++ ^ ROTL(hash, 7);
+	}
+	return(hash);
+#else
+	/*
+	 * Do four characters at a time as long as we can.
+	 * This is the one-byte recurrence applied four times over: the
+	 * oldest character ends up shifted by 21, the newest by 0, and
+	 * the previous hash is rotated by 4 * 7 bits.
+	 */
+	for (hash = 0; namelen >= 4; namelen -= 4, name += 4) {
+		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+		       (name[3] << 0) ^ ROTL(hash, 7 * 4);
+	}
+	/*
+	 * Now do the rest of the characters.
+	 */
+	switch (namelen) {
+	case 3:
+		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+		       ROTL(hash, 7 * 3);
+	case 2:
+		return (name[0] << 7) ^ (name[1] << 0) ^ ROTL(hash, 7 * 2);
+	case 1:
+		return (name[0] << 0) ^ ROTL(hash, 7 * 1);
+	case 0:
+		return hash;
+	}
+	/* NOTREACHED */
+#endif
+#undef ROTL
+	return 0;			/* keep gcc happy */
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ *
+ * args supplies the inode (dp), fork (whichfork), transaction (trans)
+ * and the allocation state handed to xfs_bmapi() (firstblock, total,
+ * flist).  On success *new_blkno receives the file offset of the new
+ * block and 0 is returned.  ENOSPC is returned when the full block
+ * could not be mapped; other errors come from the bmap routines.
+ */
+int
+xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
+{
+	xfs_fileoff_t bno, b;
+	xfs_bmbt_irec_t map;
+	xfs_bmbt_irec_t	*mapp;
+	xfs_inode_t *dp;
+	int nmap, error, w, count, c, got, i, mapi;
+	xfs_fsize_t size;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	w = args->whichfork;
+	tp = args->trans;
+	/*
+	 * For new directories adjust the file offset and block count.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
+		bno = mp->m_dirleafblk;
+		count = mp->m_dirblkfsbs;
+	} else {
+		bno = 0;
+		count = 1;
+	}
+	/*
+	 * Find a spot in the file space to put the new block.
+	 */
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
+		return error;
+	}
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
+	/*
+	 * Try mapping it in one filesystem block.
+	 */
+	nmap = 1;
+	ASSERT(args->firstblock != NULL);
+	if ((error = xfs_bmapi(tp, dp, bno, count,
+			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
+			XFS_BMAPI_CONTIG,
+			args->firstblock, args->total, &map, &nmap,
+			args->flist))) {
+		return error;
+	}
+	ASSERT(nmap <= 1);
+	if (nmap == 1) {
+		/*
+		 * Got it in one piece; use the on-stack record.  (This
+		 * assignment had been mangled into "mapp = <U+21A6>" by an
+		 * HTML-entity decode of "&map;".)
+		 */
+		mapp = &map;
+		mapi = 1;
+	}
+	/*
+	 * If we didn't get it and the block might work if fragmented,
+	 * try without the CONTIG flag.  Loop until we get it all.
+	 */
+	else if (nmap == 0 && count > 1) {
+		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		for (b = bno, mapi = 0; b < bno + count; ) {
+			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+			c = (int)(bno + count - b);
+			if ((error = xfs_bmapi(tp, dp, b, c,
+					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
+					XFS_BMAPI_METADATA,
+					args->firstblock, args->total,
+					&mapp[mapi], &nmap, args->flist))) {
+				kmem_free(mapp, sizeof(*mapp) * count);
+				return error;
+			}
+			if (nmap < 1)
+				break;
+			mapi += nmap;
+			/* continue right after the last extent we got */
+			b = mapp[mapi - 1].br_startoff +
+			    mapp[mapi - 1].br_blockcount;
+		}
+	} else {
+		mapi = 0;
+		mapp = NULL;
+	}
+	/*
+	 * Count the blocks we got, make sure it matches the total.
+	 * When nothing was mapped, got is 0 and the first test fails
+	 * before mapp[] is ever dereferenced.
+	 */
+	for (i = 0, got = 0; i < mapi; i++)
+		got += mapp[i].br_blockcount;
+	if (got != count || mapp[0].br_startoff != bno ||
+	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+	    bno + count) {
+		if (mapp != &map)
+			kmem_free(mapp, sizeof(*mapp) * count);
+		return XFS_ERROR(ENOSPC);
+	}
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * count);
+	*new_blkno = (xfs_dablk_t)bno;
+	/*
+	 * For version 1 directories, adjust the file size if it changed.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
+		ASSERT(mapi == 1);
+		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
+			return error;
+		size = XFS_FSB_TO_B(mp, bno);
+		if (size != dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Ick.
We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file.  The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ *
+ * On entry *dead_blknop / *dead_bufp identify the block the caller
+ * wants to remove.  The contents of the last block are copied over it,
+ * and the siblings and the parent entry that referred to the last block
+ * are re-pointed at the dead block's number.  On success the two
+ * in/out parameters are updated to name the (now redundant) last block
+ * so the caller removes that one instead, and 0 is returned.  On
+ * failure they are left untouched, the last block's buffer is released
+ * and an error (EFSCORRUPTED for inconsistent links) is returned.
+ * Only the data fork is supported (asserted).
+ */
+STATIC int
+xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
+		      xfs_dabuf_t **dead_bufp)
+{
+	xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
+	xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
+	xfs_fileoff_t lastoff;
+	xfs_inode_t *ip;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+	int error, w, entno, level, dead_level;
+	xfs_da_blkinfo_t *dead_info, *sib_info;
+	xfs_da_intnode_t *par_node, *dead_node;
+	xfs_dir_leafblock_t *dead_leaf;
+	xfs_dir2_leaf_t *dead_leaf2;
+	xfs_dahash_t dead_hash;
+
+	dead_buf = *dead_bufp;
+	dead_blkno = *dead_blknop;
+	tp = args->trans;
+	ip = args->dp;
+	w = args->whichfork;
+	ASSERT(w == XFS_DATA_FORK);
+	mp = ip->i_mount;
+	/*
+	 * Find the end of the btree space: for version 2 directories
+	 * that is the last mapped offset below m_dirfreeblk, otherwise
+	 * the last offset in the fork.
+	 */
+	if (XFS_DIR_IS_V2(mp)) {
+		lastoff = mp->m_dirfreeblk;
+		error = xfs_bmap_last_before(tp, ip, &lastoff, w);
+	} else
+		error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
+	if (error)
+		return error;
+	if (lastoff == 0)
+		return XFS_ERROR(EFSCORRUPTED);
+	/*
+	 * Read the last block in the btree space.
+	 */
+	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
+	if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+		return error;
+	/*
+	 * Copy the last block into the dead buffer and log it.
+	 */
+	bcopy(last_buf->data, dead_buf->data, mp->m_dirblksize);
+	xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+	dead_info = dead_buf->data;
+	/*
+	 * Get values from the moved block.
+	 * From here on dead_info describes the block that used to be
+	 * last: we need its level and its highest hashval to find the
+	 * parent entry that points at it.
+	 */
+	if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
+		ASSERT(XFS_DIR_IS_V1(mp));
+		dead_leaf = (xfs_dir_leafblock_t *)dead_info;
+		dead_level = 0;
+		dead_hash =
+			INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	} else if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
+		ASSERT(XFS_DIR_IS_V2(mp));
+		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+		dead_level = 0;
+		dead_hash = INT_GET(dead_leaf2->ents[INT_GET(dead_leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	} else {
+		ASSERT(INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		dead_node = (xfs_da_intnode_t *)dead_info;
+		dead_level = INT_GET(dead_node->hdr.level, ARCH_CONVERT);
+		dead_hash = INT_GET(dead_node->btree[INT_GET(dead_node->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	}
+	/* NULL means "nothing to release" on the error path below */
+	sib_buf = par_buf = NULL;
+	/*
+	 * If the moved block has a left sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = INT_GET(dead_info->back, ARCH_CONVERT))) {
+		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+			goto done;
+		sib_info = sib_buf->data;
+		if (INT_GET(sib_info->forw, ARCH_CONVERT) != last_blkno ||
+		    INT_GET(sib_info->magic, ARCH_CONVERT) != INT_GET(dead_info->magic, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		INT_SET(sib_info->forw, ARCH_CONVERT, dead_blkno);
+		xfs_da_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+					sizeof(sib_info->forw)));
+		xfs_da_buf_done(sib_buf);
+		sib_buf = NULL;
+	}
+	/*
+	 * If the moved block has a right sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = INT_GET(dead_info->forw, ARCH_CONVERT))) {
+		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+			goto done;
+		sib_info = sib_buf->data;
+		if (   INT_GET(sib_info->back, ARCH_CONVERT) != last_blkno
+		    || INT_GET(sib_info->magic, ARCH_CONVERT)
+				!= INT_GET(dead_info->magic, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		INT_SET(sib_info->back, ARCH_CONVERT, dead_blkno);
+		xfs_da_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+					sizeof(sib_info->back)));
+		xfs_da_buf_done(sib_buf);
+		sib_buf = NULL;
+	}
+	/* start the descent at the root: block 0 for v1, m_dirleafblk for v2 */
+	par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
+	level = -1;
+	/*
+	 * Walk down the tree looking for the parent of the moved block.
+	 * Each block read must be a node exactly one level below the
+	 * previous one, otherwise the tree is corrupt.
+	 */
+	for (;;) {
+		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+			goto done;
+		par_node = par_buf->data;
+		if (INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC ||
+		    (level >= 0 && level != INT_GET(par_node->hdr.level, ARCH_CONVERT) + 1)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		level = INT_GET(par_node->hdr.level, ARCH_CONVERT);
+		/* first entry whose hashval is not below the moved block's */
+		for (entno = 0;
+		     entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
+		     INT_GET(par_node->btree[entno].hashval, ARCH_CONVERT) < dead_hash;
+		     entno++)
+			continue;
+		if (entno == INT_GET(par_node->hdr.count, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		par_blkno = INT_GET(par_node->btree[entno].before, ARCH_CONVERT);
+		if (level == dead_level + 1)
+			break;
+		xfs_da_brelse(tp, par_buf);
+		par_buf = NULL;
+	}
+	/*
+	 * We're in the right parent block.
+	 * Look for the right entry.
+	 * Several entries can share a hashval, so scan forward (moving
+	 * on to following siblings at the same level if need be) until
+	 * an entry actually points at the old last block.
+	 */
+	for (;;) {
+		for (;
+		     entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
+		     INT_GET(par_node->btree[entno].before, ARCH_CONVERT) != last_blkno;
+		     entno++)
+			continue;
+		if (entno < INT_GET(par_node->hdr.count, ARCH_CONVERT))
+			break;
+		par_blkno = INT_GET(par_node->hdr.info.forw, ARCH_CONVERT);
+		xfs_da_brelse(tp, par_buf);
+		par_buf = NULL;
+		if (par_blkno == 0) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+			goto done;
+		par_node = par_buf->data;
+		if (INT_GET(par_node->hdr.level, ARCH_CONVERT) != level ||
+		    INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		entno = 0;
+	}
+	/*
+	 * Update the parent entry pointing to the moved block.
+	 */
+	INT_SET(par_node->btree[entno].before, ARCH_CONVERT, dead_blkno);
+	xfs_da_log_buf(tp, par_buf,
+		XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
+				sizeof(par_node->btree[entno].before)));
+	xfs_da_buf_done(par_buf);
+	xfs_da_buf_done(dead_buf);
+	/* hand the old last block back to the caller as the one to remove */
+	*dead_blknop = last_blkno;
+	*dead_bufp = last_buf;
+	return 0;
+done:
+	if (par_buf)
+		xfs_da_brelse(tp, par_buf);
+	if (sib_buf)
+		xfs_da_brelse(tp, sib_buf);
+	xfs_da_brelse(tp, last_buf);
+	return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ *
+ * dead_blkno / dead_buf name the block to unmap.  The buffer is always
+ * invalidated through xfs_da_binval(), on success and on failure alike.
+ * If xfs_bunmapi() reports ENOSPC on the data fork, the block is first
+ * swapped with the last block (see xfs_da_swap_lastblock) and the unmap
+ * is retried on that one.  Returns 0 or an error.
+ */
+int
+xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+		    xfs_dabuf_t *dead_buf)
+{
+	xfs_inode_t *dp;
+	int done, error, w, count;
+	xfs_fileoff_t bno;
+	xfs_fsize_t size;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+
+	dp = args->dp;
+	w = args->whichfork;
+	tp = args->trans;
+	mp = dp->i_mount;
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		count = mp->m_dirblkfsbs;
+	else
+		count = 1;
+	for (;;) {
+		/*
+		 * Remove extents.  If we get ENOSPC for a dir we have to move
+		 * the last block to the place we want to kill.
+		 */
+		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
+				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
+				0, args->firstblock, args->flist,
+				&done)) == ENOSPC) {
+			if (w != XFS_DATA_FORK)
+				goto done;
+			if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
+					&dead_buf)))
+				goto done;
+		} else if (error)
+			goto done;
+		else
+			break;
+	}
+	ASSERT(done);
+	xfs_da_binval(tp, dead_buf);
+	/*
+	 * Adjust the directory size for version 1.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
+		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
+			return error;
+		size = XFS_FSB_TO_B(dp->i_mount, bno);
+		if (size != dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+	/* "done" the label is distinct from "done" the variable above */
+done:
+	xfs_da_binval(tp, dead_buf);
+	return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ *
+ * nmap is the number of records in mapp[]; bno and count give the file
+ * offset range the records are expected to cover.  Returns nonzero when
+ * they do, 0 otherwise (which includes the nmap == 0 case whenever
+ * count is nonzero).
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+	int		nmap,
+	xfs_bmbt_irec_t	*mapp,
+	xfs_dablk_t	bno,
+	int		count)
+{
+	int		i;
+	xfs_fileoff_t	off;
+
+	/* off tracks the file offset the next record has to start at */
+	for (i = 0, off = bno; i < nmap; i++) {
+		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
+			return 0;
+		}
+		if (off != mapp[i].br_startoff) {
+			return 0;
+		}
+		off += mapp[i].br_blockcount;
+	}
+	return off == bno + count;
+}
+
+/*
+ * Make a dabuf.
+ * Used for get_buf, read_buf, read_bufr, and reada_buf.
+ */
+/*
+ * trans, dp	transaction and inode the block belongs to.
+ * bno		file offset (in filesystem blocks) of the dir/attr block.
+ * mappedbnop	in: -1 to look the mapping up and treat a hole as
+ *		EFSCORRUPTED, -2 to look it up and quietly return success
+ *		with no buffer on a hole, anything else is taken as the
+ *		disk address to use.  out: disk address of the first
+ *		mapping, once buffers are being set up.
+ * bpp		where to return the dabuf; may be NULL.  Set to NULL on
+ *		every error return.
+ * whichfork	XFS_DATA_FORK or XFS_ATTR_FORK.
+ * caller	0 = get_buf, 1 = read_buf (contents read and magic number
+ *		checked), 2 = read without the magic check (only handled
+ *		when built outside the kernel), 3 = readahead (no dabuf
+ *		is built).
+ * ra		caller's return address, handed to xfs_da_buf_make().
+ */
+STATIC int
+xfs_da_do_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t	*mappedbnop,
+	xfs_dabuf_t	**bpp,
+	int		whichfork,
+	int		caller,
+	inst_t		*ra)
+{
+	xfs_buf_t	*bp = NULL;
+	xfs_buf_t	**bplist;
+	int		error=0;
+	int		i;
+	xfs_bmbt_irec_t	map;
+	xfs_bmbt_irec_t	*mapp;
+	xfs_daddr_t	mappedbno;
+	xfs_mount_t	*mp;
+	int		nbplist=0;
+	int		nfsb;
+	int		nmap;
+	xfs_dabuf_t	*rbp;
+
+	mp = dp->i_mount;
+	if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		nfsb = mp->m_dirblkfsbs;
+	else
+		nfsb = 1;
+	mappedbno = *mappedbnop;
+	/*
+	 * Caller doesn't have a mapping.  -2 means don't complain
+	 * if we land in a hole.
+	 */
+	if (mappedbno == -1 || mappedbno == -2) {
+		/*
+		 * Optimize the one-block case.
+		 */
+		if (nfsb == 1) {
+			xfs_fsblock_t	fsb;
+
+			/*
+			 * Use the on-stack record.  It is set before the
+			 * lookup so that a failure can leave through exit0
+			 * (nothing to free there) and clear *bpp like every
+			 * other error return does.
+			 */
+			mapp = &map;
+			if ((error =
+			    xfs_bmapi_single(trans, dp, whichfork, &fsb,
+				    (xfs_fileoff_t)bno))) {
+				goto exit0;
+			}
+			if (fsb == NULLFSBLOCK) {
+				nmap = 0;
+			} else {
+				map.br_startblock = fsb;
+				map.br_startoff = (xfs_fileoff_t)bno;
+				map.br_blockcount = 1;
+				nmap = 1;
+			}
+		} else {
+			mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
+			nmap = nfsb;
+			if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
+					nfsb,
+					XFS_BMAPI_METADATA |
+						XFS_BMAPI_AFLAG(whichfork),
+					NULL, 0, mapp, &nmap, NULL)))
+				goto exit0;
+		}
+	} else {
+		/*
+		 * The caller told us where the block lives; describe it
+		 * with a single on-stack record.
+		 */
+		map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+		map.br_startoff = (xfs_fileoff_t)bno;
+		map.br_blockcount = nfsb;
+		mapp = &map;
+		nmap = 1;
+	}
+	if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
+		error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
+		goto exit0;
+	}
+	/*
+	 * A list of buffers is only needed when a dabuf will be built
+	 * from more than one mapping.
+	 */
+	if (caller != 3 && nmap > 1) {
+		bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
+		nbplist = 0;
+	} else
+		bplist = NULL;
+	/*
+	 * Turn the mapping(s) into buffer(s).
+	 */
+	for (i = 0; i < nmap; i++) {
+		int	nmapped;
+
+		mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
+		if (i == 0)
+			*mappedbnop = mappedbno;
+		nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
+		switch (caller) {
+		case 0:
+			bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
+				mappedbno, nmapped, 0);
+			error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
+			break;
+		case 1:
+#ifndef __KERNEL__
+		case 2:
+#endif
+			bp = NULL;
+			error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
+				mappedbno, nmapped, 0, &bp);
+			break;
+#ifdef __KERNEL__
+		case 3:
+			xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
+			error = 0;
+			bp = NULL;
+			break;
+#endif
+		}
+		if (error) {
+			if (bp)
+				xfs_trans_brelse(trans, bp);
+			goto exit1;
+		}
+		if (!bp)
+			continue;
+		if (caller == 1) {
+			if (whichfork == XFS_ATTR_FORK) {
+				XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
+						XFS_ATTR_BTREE_REF);
+			} else {
+				XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
+						XFS_DIR_BTREE_REF);
+			}
+		}
+		if (bplist) {
+			bplist[nbplist++] = bp;
+		}
+	}
+	/*
+	 * Build a dabuf structure.
+	 */
+	if (bplist) {
+		rbp = xfs_da_buf_make(nbplist, bplist, ra);
+	} else if (bp)
+		rbp = xfs_da_buf_make(1, &bp, ra);
+	else
+		rbp = NULL;
+	/*
+	 * For read_buf, check the magic number.
+	 * NOTE(review): rbp is dereferenced unconditionally here; this
+	 * relies on xfs_trans_read_buf() never reporting success without
+	 * handing back a buffer -- confirm.
+	 */
+	if (caller == 1) {
+		xfs_dir2_data_t		*data;
+		xfs_dir2_free_t		*free;
+		xfs_da_blkinfo_t	*info;
+		uint			magic, magic1;
+
+		info = rbp->data;
+		data = rbp->data;
+		free = rbp->data;
+		magic = INT_GET(info->magic, ARCH_CONVERT);
+		magic1 = INT_GET(data->hdr.magic, ARCH_CONVERT);
+		if (XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
+				   (magic != XFS_DIR_LEAF_MAGIC) &&
+				   (magic != XFS_ATTR_LEAF_MAGIC) &&
+				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
+				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
+				   (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
+				   (magic1 != XFS_DIR2_DATA_MAGIC) &&
+				   (INT_GET(free->hdr.magic, ARCH_CONVERT) != XFS_DIR2_FREE_MAGIC),
+				mp, XFS_ERRTAG_DA_READ_BUF,
+				XFS_RANDOM_DA_READ_BUF)) {
+			xfs_buftrace("DA READ ERROR", rbp->bps[0]);
+			error = XFS_ERROR(EFSCORRUPTED);
+			xfs_da_brelse(trans, rbp);
+			/*
+			 * xfs_da_brelse() already released the buffers;
+			 * keep exit1 from releasing them a second time.
+			 */
+			nbplist = 0;
+			goto exit1;
+		}
+	}
+	if (bplist) {
+		kmem_free(bplist, sizeof(*bplist) * nmap);
+	}
+	if (mapp != &map) {
+		kmem_free(mapp, sizeof(*mapp) * nfsb);
+	}
+	if (bpp)
+		*bpp = rbp;
+	return 0;
+exit1:
+	if (bplist) {
+		for (i = 0; i < nbplist; i++)
+			xfs_trans_brelse(trans, bplist[i]);
+		kmem_free(bplist, sizeof(*bplist) * nmap);
+	}
+exit0:
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * nfsb);
+	if (bpp)
+		*bpp = NULL;
+	return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ * Thin wrapper: xfs_da_do_buf() with caller type 0.
+ */
+int
+xfs_da_get_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t		mappedbno,
+	xfs_dabuf_t	**bpp,
+	int		whichfork)
+{
+	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
+						 (inst_t *)__return_address);
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ * Thin wrapper: xfs_da_do_buf() with caller type 1, which also
+ * validates the block's magic number.
+ */
+int
+xfs_da_read_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t		mappedbno,
+	xfs_dabuf_t	**bpp,
+	int		whichfork)
+{
+	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
+		(inst_t *)__return_address);
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+/*
+ * Returns the disk address of the block's first mapping, or -1 when
+ * xfs_da_do_buf() fails.  The mapping is looked up with the "complain
+ * about holes" setting (-1) and no dabuf is returned.
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	int		whichfork)
+{
+	xfs_daddr_t		rval;
+
+	rval = -1;
+	if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
+			(inst_t *)__return_address))
+		return -1;
+	else
+		return rval;
+}
+
+/*
+ * Calculate the number of bits needed to hold i different values.
+ *
+ * Returns the smallest rval with (1 << rval) >= i, or the bit width of
+ * uint when no such shift exists.
+ */
+uint
+xfs_da_log2_roundup(uint i)
+{
+	uint rval;
+
+	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
+		/*
+		 * Shift an unsigned one: rval reaches the top bit, and
+		 * shifting a signed 1 that far is undefined behaviour.
+		 */
+		if ((1U << rval) >= i)
+			break;
+	}
+	return(rval);
+}
+
+kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
+kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ * The structure comes back zeroed (kmem_zone_zalloc).
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+	int	i;
+
+	for (i = 0; i < state->altpath.active; i++) {
+		if (state->altpath.blk[i].bp) {
+			/*
+			 * A dabuf shared with the main path at the same
+			 * level is left for xfs_da_state_free() to drop.
+			 */
+			if (state->altpath.blk[i].bp != state->path.blk[i].bp)
+				xfs_da_buf_done(state->altpath.blk[i].bp);
+			state->altpath.blk[i].bp = NULL;
+		}
+	}
+	state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ * Drops every dabuf still referenced from the altpath, the path and
+ * (when marked valid) the extra block, then returns the structure to
+ * its zone.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+	int	i;
+
+	xfs_da_state_kill_altpath(state);
+	for (i = 0; i < state->path.active; i++) {
+		if (state->path.blk[i].bp)
+			xfs_da_buf_done(state->path.blk[i].bp);
+	}
+	if (state->extravalid && state->extrablk.bp)
+		xfs_da_buf_done(state->extrablk.bp);
+#ifdef DEBUG
+	bzero((char *)state, sizeof(*state));
+#endif /* DEBUG */
+	kmem_zone_free(xfs_da_state_zone, state);
+}
+
+#ifdef XFS_DABUF_DEBUG
+xfs_dabuf_t	*xfs_dabuf_global_list;
+lock_t		xfs_dabuf_global_lock;
+#endif
+
+/*
+ * Create a dabuf.
+ */
+/* ARGSUSED */
+STATIC xfs_dabuf_t *
+xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
+{
+	xfs_dabuf_t	*db;
+	xfs_buf_t	*cur;
+	int		idx;
+	int		offset;
+
+	/*
+	 * Single-buffer dabufs come from their own zone; anything else
+	 * is sized for the number of buffers it has to carry.
+	 */
+	if (nbuf != 1)
+		db = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+	else
+		db = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+	db->dirty = 0;
+#ifdef XFS_DABUF_DEBUG
+	db->ra = ra;
+	db->dev = XFS_BUF_TARGET_DEV(bps[0]);
+	db->blkno = XFS_BUF_ADDR(bps[0]);
+#endif
+	db->nbuf = nbuf;
+	if (nbuf != 1) {
+		/*
+		 * Several buffers: record them all, add up their sizes,
+		 * then gather their contents into one contiguous copy.
+		 */
+		db->bbcount = 0;
+		for (idx = 0; idx < nbuf; idx++) {
+			cur = bps[idx];
+			db->bps[idx] = cur;
+			db->bbcount += BTOBB(XFS_BUF_COUNT(cur));
+		}
+		db->data = kmem_alloc(BBTOB(db->bbcount), KM_SLEEP);
+		offset = 0;
+		for (idx = 0; idx < nbuf; idx++) {
+			cur = bps[idx];
+			bcopy(XFS_BUF_PTR(cur), (char *)db->data + offset,
+				XFS_BUF_COUNT(cur));
+			offset += XFS_BUF_COUNT(cur);
+		}
+	} else {
+		/*
+		 * One buffer: the dabuf simply points into it.
+		 */
+		cur = bps[0];
+		db->bps[0] = cur;
+		db->bbcount = (short)BTOBB(XFS_BUF_COUNT(cur));
+		db->data = XFS_BUF_PTR(cur);
+	}
+#ifdef XFS_DABUF_DEBUG
+	{
+		SPLDECL(s);
+		xfs_dabuf_t	*scan;
+
+		/*
+		 * No other live dabuf may describe the same device and
+		 * block; then push the new one on the front of the list.
+		 */
+		s = mutex_spinlock(&xfs_dabuf_global_lock);
+		for (scan = xfs_dabuf_global_list; scan; scan = scan->next) {
+			ASSERT(scan->blkno != db->blkno ||
+			       scan->dev != db->dev);
+		}
+		db->prev = NULL;
+		if (xfs_dabuf_global_list)
+			xfs_dabuf_global_list->prev = db;
+		db->next = xfs_dabuf_global_list;
+		xfs_dabuf_global_list = db;
+		mutex_spinunlock(&xfs_dabuf_global_lock, s);
+	}
+#endif
+	return db;
+}
+
+/*
+ * Un-dirty a dabuf: copy the contiguous data area back out into the
+ * individual buffers it was gathered from.  Nothing to do unless the
+ * dabuf has been marked dirty, which only happens for multi-buffer
+ * dabufs (asserted).
+ */
+STATIC void
+xfs_da_buf_clean(xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*cur;
+	int		idx;
+	int		offset;
+
+	if (!dabuf->dirty)
+		return;
+	ASSERT(dabuf->nbuf > 1);
+	dabuf->dirty = 0;
+	offset = 0;
+	for (idx = 0; idx < dabuf->nbuf; idx++) {
+		cur = dabuf->bps[idx];
+		bcopy((char *)dabuf->data + offset, XFS_BUF_PTR(cur),
+			XFS_BUF_COUNT(cur));
+		offset += XFS_BUF_COUNT(cur);
+	}
+}
+
+/*
+ * Release a dabuf.
+ */
+/*
+ * Flushes a dirty multi-buffer dabuf back into its buffers, frees the
+ * gathered data copy (multi-buffer case only) and then frees the dabuf
+ * itself.  The underlying buffers are not released here.
+ */
+void
+xfs_da_buf_done(xfs_dabuf_t *dabuf)
+{
+	int	nbuf;
+
+	ASSERT(dabuf);
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	/*
+	 * Remember the buffer count now: the debug code below zeroes the
+	 * structure, and the count is still needed afterwards to choose
+	 * the matching free routine and size.
+	 */
+	nbuf = dabuf->nbuf;
+	if (dabuf->dirty)
+		xfs_da_buf_clean(dabuf);
+	if (nbuf > 1)
+		kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
+#ifdef XFS_DABUF_DEBUG
+	{
+		SPLDECL(s);
+
+		/* unhook from the global list of live dabufs */
+		s = mutex_spinlock(&xfs_dabuf_global_lock);
+		if (dabuf->prev)
+			dabuf->prev->next = dabuf->next;
+		else
+			xfs_dabuf_global_list = dabuf->next;
+		if (dabuf->next)
+			dabuf->next->prev = dabuf->prev;
+		mutex_spinunlock(&xfs_dabuf_global_lock, s);
+	}
+	bzero(dabuf, XFS_DA_BUF_SIZE(nbuf));
+#endif
+	if (nbuf == 1)
+		kmem_zone_free(xfs_dabuf_zone, dabuf);
+	else
+		kmem_free(dabuf, XFS_DA_BUF_SIZE(nbuf));
+}
+
+/*
+ * Log transaction from a dabuf.
+ *
+ * first and last are inclusive byte offsets into the dabuf's data.
+ * For a single-buffer dabuf they are passed straight through; for a
+ * multi-buffer dabuf the range is split into the piece that falls in
+ * each underlying buffer and the dabuf is marked dirty so its data is
+ * copied back to the buffers before it is released.
+ */
+void
+xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
+{
+	xfs_buf_t	*bp;
+	uint		f;
+	int		i;
+	uint		l;
+	int		off;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if (dabuf->nbuf == 1) {
+		ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
+		xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
+		return;
+	}
+	dabuf->dirty = 1;
+	ASSERT(first <= last);
+	for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+		bp = dabuf->bps[i];
+		/* clip [first, last] to this buffer's span [f, l] */
+		f = off;
+		l = f + XFS_BUF_COUNT(bp) - 1;
+		if (f < first)
+			f = first;
+		if (l > last)
+			l = last;
+		if (f <= l)
+			xfs_trans_log_buf(tp, bp, f - off, l - off);
+		/*
+		 * B_DONE is set by xfs_trans_log_buf.
+		 * If we don't set it on a new buffer (get not read)
+		 * then if we don't put anything in the buffer it won't
+		 * be set, and at commit it is released into the cache,
+		 * and then a read will fail.
+		 */
+		else if (!(XFS_BUF_ISDONE(bp)))
+			XFS_BUF_DONE(bp);
+	}
+	ASSERT(last < off);
+}
+
+/*
+ * Release dabuf from a transaction.
+ * Have to free up the dabuf before the buffers are released,
+ * since the synchronization on the dabuf is really the lock on the buffer.
+ */
+void
+xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*single;
+	xfs_buf_t	**bufs;
+	int		idx;
+	int		nbuf;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	/*
+	 * Take a private copy of the buffer pointers; the dabuf holding
+	 * them is gone once xfs_da_buf_done() returns.
+	 */
+	nbuf = dabuf->nbuf;
+	if (nbuf != 1) {
+		bufs = kmem_alloc(nbuf * sizeof(*bufs), KM_SLEEP);
+		bcopy(dabuf->bps, bufs, nbuf * sizeof(*bufs));
+	} else {
+		single = dabuf->bps[0];
+		bufs = &single;
+	}
+	xfs_da_buf_done(dabuf);
+	idx = 0;
+	while (idx < nbuf) {
+		xfs_trans_brelse(tp, bufs[idx]);
+		idx++;
+	}
+	if (bufs != &single)
+		kmem_free(bufs, nbuf * sizeof(*bufs));
+}
+
+/*
+ * Invalidate dabuf from a transaction.
+ * Same sequence as xfs_da_brelse(): the dabuf is torn down first, then
+ * each underlying buffer is invalidated in the transaction.
+ */
+void
+xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*single;
+	xfs_buf_t	**bufs;
+	int		idx;
+	int		nbuf;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	nbuf = dabuf->nbuf;
+	if (nbuf != 1) {
+		bufs = kmem_alloc(nbuf * sizeof(*bufs), KM_SLEEP);
+		bcopy(dabuf->bps, bufs, nbuf * sizeof(*bufs));
+	} else {
+		single = dabuf->bps[0];
+		bufs = &single;
+	}
+	xfs_da_buf_done(dabuf);
+	idx = 0;
+	while (idx < nbuf) {
+		xfs_trans_binval(tp, bufs[idx]);
+		idx++;
+	}
+	if (bufs != &single)
+		kmem_free(bufs, nbuf * sizeof(*bufs));
+}
+
+/*
+ * Get the first daddr from a dabuf, i.e. the address of the first
+ * underlying buffer.
+ */
+xfs_daddr_t
+xfs_da_blkno(xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*first;
+
+	ASSERT(dabuf->nbuf);
+	ASSERT(dabuf->data);
+	first = dabuf->bps[0];
+	return XFS_BUF_ADDR(first);
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_da_btree.h linux-2.4-xfs/fs/xfs/xfs_da_btree.h
--- linux-2.4.19/fs/xfs/xfs_da_btree.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_da_btree.h Fri Aug 9 11:20:51 2002
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define __XFS_DA_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct zone;
+
+/*========================================================================
+ * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
+#define XFS_DIR_LEAF_MAGIC	0xfeeb	/* magic number: directory leaf blks */
+#define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
+
+#define XFS_DIRX_LEAF_MAGIC(mp)	\
+	(XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
+
+typedef struct xfs_da_blkinfo {
+	xfs_dablk_t forw;			/* following (forward) block in list */
+	xfs_dablk_t back;			/* previous (backward) block in list */
+	__uint16_t magic;			/* validity check on block */
+	__uint16_t pad;				/* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all matches in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH	5	/* max depth of Btree */
+
+typedef struct xfs_da_intnode {
+	struct xfs_da_node_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active entries */
+		__uint16_t level;	/* level above leaves (leaf == 0) */
+	} hdr;
+	struct xfs_da_node_entry {
+		xfs_dahash_t hashval;	/* hash value for this descendant */
+		xfs_dablk_t before;	/* Btree block before this key */
+	} btree[1];			/* variable sized array of keys */
+} xfs_da_intnode_t;
+typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
+typedef struct xfs_da_node_entry xfs_da_node_entry_t;
+
+#define XFS_DA_NODE_ENTSIZE_BYNAME	/* space a name uses */ \
+	(sizeof(xfs_da_node_entry_t))
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_NODE_ENTRIES)
+int xfs_da_node_entries(struct xfs_mount *mp);
+#define XFS_DA_NODE_ENTRIES(mp)		xfs_da_node_entries(mp)
+#else
+#define XFS_DA_NODE_ENTRIES(mp)		((mp)->m_da_node_ents)
+#endif
+
+#define XFS_DA_MAXHASH	((xfs_dahash_t)-1) /* largest valid hash value */
+
+/*
+ * Macros used by directory code to interface to the filesystem.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBSIZE)
+int xfs_lbsize(struct xfs_mount *mp);
+#define XFS_LBSIZE(mp)			xfs_lbsize(mp)
+#else
+#define XFS_LBSIZE(mp)	((mp)->m_sb.sb_blocksize)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBLOG)
+int xfs_lblog(struct xfs_mount *mp);
+#define XFS_LBLOG(mp)			xfs_lblog(mp)
+#else
+#define XFS_LBLOG(mp)	((mp)->m_sb.sb_blocklog)
+#endif
+
+/*
+ * Macros used by directory code to interface to the kernel
+ */
+
+/*
+ * Macros used to manipulate directory off_t's
+ *
+ * A cookie is a 64-bit xfs_off_t: the low 32 bits hold the hash value and
+ * the high 32 bits hold a "bnoentry", i.e. the block number shifted left
+ * by m_dircook_elog with the entry index in the low bits.  The BNO and
+ * ENTRY extractors yield 0 when (cookie >> 31) == -1.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_BNOENTRY)
+__uint32_t xfs_da_make_bnoentry(struct xfs_mount *mp, xfs_dablk_t bno,
+				int entry);
+#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry)	\
+	xfs_da_make_bnoentry(mp,bno,entry)
+#else
+#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry)	\
+	(((bno) << (mp)->m_dircook_elog) | (entry))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_COOKIE)
+xfs_off_t xfs_da_make_cookie(struct xfs_mount *mp, xfs_dablk_t bno, int entry,
+				xfs_dahash_t hash);
+#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash)	\
+	xfs_da_make_cookie(mp,bno,entry,hash)
+#else
+#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash)	\
+	(((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_HASH)
+xfs_dahash_t xfs_da_cookie_hash(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_HASH(mp,cookie)		xfs_da_cookie_hash(mp,cookie)
+#else
+#define XFS_DA_COOKIE_HASH(mp,cookie)	((xfs_dahash_t)(cookie))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_BNO)
+xfs_dablk_t xfs_da_cookie_bno(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_BNO(mp,cookie)		xfs_da_cookie_bno(mp,cookie)
+#else
+#define XFS_DA_COOKIE_BNO(mp,cookie)	\
+	(((xfs_off_t)(cookie) >> 31) == -1LL ? \
+		(xfs_dablk_t)0 : \
+		(xfs_dablk_t)((xfs_off_t)(cookie) >> ((mp)->m_dircook_elog + 32)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_ENTRY)
+int xfs_da_cookie_entry(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_ENTRY(mp,cookie)		xfs_da_cookie_entry(mp,cookie)
+#else
+#define XFS_DA_COOKIE_ENTRY(mp,cookie)	\
+	(((xfs_off_t)(cookie) >> 31) == -1LL ? \
+		(xfs_dablk_t)0 : \
+		(xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
+			      ((1 << (mp)->m_dircook_elog) - 1)))
+#endif
+
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+	uchar_t		*name;		/* string (maybe not NUL terminated) */
+	int		namelen;	/* length of string (maybe no NUL) */
+	uchar_t		*value;		/* set of bytes (may contain NULs) */
+	int		valuelen;	/* length of value */
+	int		flags;		/* argument flags (eg: ATTR_NOCREATE) */
+	xfs_dahash_t	hashval;	/* hash value of name */
+	xfs_ino_t	inumber;	/* input/output inode number */
+	struct xfs_inode *dp;		/* directory inode to manipulate */
+	xfs_fsblock_t	*firstblock;	/* ptr to firstblock for bmap calls */
+	struct xfs_bmap_free *flist;	/* ptr to freelist for bmap_finish */
+	struct xfs_trans *trans;	/* current trans (changes over time) */
+	xfs_extlen_t	total;		/* total blocks needed, for 1st bmap */
+	int		whichfork;	/* data or attribute fork */
+	xfs_dablk_t	blkno;		/* blkno of attr leaf of interest */
+	int		index;		/* index of attr of interest in blk */
+	xfs_dablk_t	rmtblkno;	/* remote attr value starting blkno */
+	int		rmtblkcnt;	/* remote attr value block count */
+	int		rename;		/* T/F: this is an atomic rename op */
+	xfs_dablk_t	blkno2;		/* blkno of 2nd attr leaf of interest */
+	int		index2;		/* index of 2nd attr in blk */
+	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
+	int		rmtblkcnt2;	/* remote attr value block count */
+	int		justcheck;	/* check for ok with no space */
+	int		addname;	/* T/F: this is an add operation */
+	int		oknoent;	/* T/F: ok to return ENOENT, else die */
+} xfs_da_args_t;
+
+/*
+ * Structure to describe buffer(s) for a block.
+ * This is needed in the directory version 2 format case, when
+ * multiple non-contiguous fsblocks might be needed to cover one
+ * logical directory block.
+ * If the buffer count is 1 then the data pointer points to the
+ * same place as the b_addr field for the buffer, else to kmem_alloced memory.
+ */
+typedef struct xfs_dabuf {
+	int		nbuf;		/* number of buffer pointers present */
+	short		dirty;		/* data needs to be copied back */
+	short		bbcount;	/* how large is data in bbs */
+	void		*data;		/* pointer for buffers' data */
+#ifdef XFS_DABUF_DEBUG
+	inst_t		*ra;		/* return address of caller to make */
+	struct xfs_dabuf *next;		/* next in global chain */
+	struct xfs_dabuf *prev;		/* previous in global chain */
+	dev_t		dev;		/* device for buffer */
+	xfs_daddr_t	blkno;		/* daddr first in bps[0] */
+#endif
+	struct xfs_buf	*bps[1];	/* actually nbuf of these */
+} xfs_dabuf_t;
+#define XFS_DA_BUF_SIZE(n)	/* bytes in a dabuf holding n buffer ptrs */ \
+	(sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
+
+#ifdef XFS_DABUF_DEBUG
+extern xfs_dabuf_t	*xfs_dabuf_global_list;	/* head of the next/prev chain */
+#endif
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes. With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+	xfs_dabuf_t	*bp;		/* buffer containing block */
+	xfs_dablk_t	blkno;		/* filesystem blkno of buffer */
+	xfs_daddr_t	disk_blkno;	/* on-disk blkno (in BBs) of buffer */
+	int		index;		/* relevant index into block */
+	xfs_dahash_t	hashval;	/* last hash value in block */
+	int		magic;		/* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+	int			active;		/* number of active levels */
+	xfs_da_state_blk_t	blk[XFS_DA_NODE_MAXDEPTH];	/* one per level */
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+	xfs_da_args_t		*args;		/* filename arguments */
+	struct xfs_mount	*mp;		/* filesystem mount point */
+	int			blocksize;	/* logical block size */
+	int			inleaf;		/* insert into 1->lf, 0->splf */
+	xfs_da_state_path_t	path;		/* search/split paths */
+	xfs_da_state_path_t	altpath;	/* alternate path for join */
+	int			extravalid;	/* T/F: extrablk is in use */
+	int			extraafter;	/* T/F: extrablk is after new */
+	xfs_da_state_blk_t	extrablk;	/* for double-splits on leafs */
+						/* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ *
+ * XFS_DA_LOGOFF yields the byte offset of ADDR from BASE.
+ * XFS_DA_LOGRANGE expands to TWO comma-separated uint arguments, the
+ * first and last (inclusive) byte offsets of a SIZE-byte field at ADDR,
+ * so it can only be used inside an argument list.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR)	((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)	\
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+
+
+#ifdef __KERNEL__
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+					 xfs_dabuf_t **bpp, int whichfork);
+int	xfs_da_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_da_join(xfs_da_state_t *state);
+void	xfs_da_fixhashpath(xfs_da_state_t *state,
+					  xfs_da_state_path_t *path_to_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int	xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
+int	xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+					 int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int	xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+					 xfs_da_state_blk_t *save_blk);
+int	xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+				       xfs_da_state_blk_t *new_blk);
+
+/*
+ * Utility routines.
+ */
+int	xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int	xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			      xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			      xfs_dabuf_t **bp, int whichfork);
+int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			       xfs_dabuf_t **bpp, int whichfork);
+xfs_daddr_t	xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			xfs_dablk_t bno, int whichfork);
+int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+					  xfs_dabuf_t *dead_buf);
+
+uint xfs_da_hashname(uchar_t *name_string, int name_length);
+uint xfs_da_log2_roundup(uint i);
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_kill_altpath(xfs_da_state_t *state);
+
+void xfs_da_buf_done(xfs_dabuf_t *dabuf);	/* frees the dabuf itself */
+void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
+			   uint last);	/* first..last: inclusive byte range */
+void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);	/* frees dabuf, releases its buffers */
+void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);	/* frees dabuf, invalidates its buffers */
+xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);	/* daddr of bps[0] */
+
+extern struct kmem_zone *xfs_da_state_zone;
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DA_BTREE_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dfrag.c linux-2.4-xfs/fs/xfs/xfs_dfrag.c
---
linux-2.4.19/fs/xfs/xfs_dfrag.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_dfrag.c Thu Sep 5 15:35:08 2002 @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include + +/* + * Syssgi interface for swapext + */ +int +xfs_swapext( + xfs_swapext_t *sxp) +{ + xfs_swapext_t sx; + xfs_inode_t *ip=NULL, *tip=NULL, *ips[2]; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_bstat_t *sbp; + struct file *fp = NULL, *tfp = NULL; + vnode_t *vp, *tvp; + bhv_desc_t *bdp, *tbdp; + vn_bhv_head_t *bhp, *tbhp; + uint lock_flags=0; + int ilf_fields, tilf_fields; + int error = 0; + xfs_ifork_t tempif, *ifp, *tifp; + __uint64_t tmp; + int aforkblks = 0; + int taforkblks = 0; + int locked = 0; + + if (copy_from_user(&sx, sxp, sizeof(sx))) + return XFS_ERROR(EFAULT); + + /* Pull information for the target fd */ + if (((fp = fget((int)sx.sx_fdtarget)) == NULL) || + ((vp = LINVFS_GET_VP(fp->f_dentry->d_inode)) == NULL)) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + bhp = VN_BHV_HEAD(vp); + VN_BHV_READ_LOCK(bhp); + bdp = vn_bhv_lookup(bhp, &xfs_vnodeops); + if (bdp == NULL) { + VN_BHV_READ_UNLOCK(bhp); + error = XFS_ERROR(EBADF); + goto error0; + } else { + ip = XFS_BHVTOI(bdp); + VN_BHV_READ_UNLOCK(bhp); + } + + if (((tfp = fget((int)sx.sx_fdtmp)) == NULL) || + ((tvp = LINVFS_GET_VP(tfp->f_dentry->d_inode)) == NULL)) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + tbhp = VN_BHV_HEAD(tvp); + VN_BHV_READ_LOCK(tbhp); + tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops); + if (tbdp == NULL) { + VN_BHV_READ_UNLOCK(tbhp); + error = XFS_ERROR(EBADF); + goto error0; + } else { + tip = XFS_BHVTOI(tbdp); + VN_BHV_READ_UNLOCK(tbhp); + } + + if (ip->i_ino == tip->i_ino) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + mp = ip->i_mount; + + sbp = &sx.sx_stat; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto error0; + } + + locked = 1; + + /* Lock in i_ino order 
*/ + if (ip->i_ino < tip->i_ino) { + ips[0] = ip; + ips[1] = tip; + } else { + ips[0] = tip; + ips[1] = ip; + } + lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; + xfs_lock_inodes(ips, 2, 0, lock_flags); + + /* Check permissions */ + if ((error = _MAC_XFS_IACCESS(ip, MACWRITE, NULL))) { + goto error0; + } + if ((error = _MAC_XFS_IACCESS(tip, MACWRITE, NULL))) { + goto error0; + } + if ((current->fsuid != ip->i_d.di_uid) && + (error = xfs_iaccess(ip, IWRITE, NULL)) && + !capable_cred(NULL, CAP_FOWNER)) { + goto error0; + } + if ((current->fsuid != tip->i_d.di_uid) && + (error = xfs_iaccess(tip, IWRITE, NULL)) && + !capable_cred(NULL, CAP_FOWNER)) { + goto error0; + } + + /* Verify both files are either real-time or non-realtime */ + if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != + (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + if (VN_CACHED(tvp) != 0) + xfs_inval_cached_pages(XFS_ITOV(tip), &(tip->i_iocore), + (loff_t)0, 0, 0); + + /* Verify O_DIRECT for ftmp */ + if (VN_CACHED(tvp) != 0) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + /* Verify all data are being swapped */ + if (sx.sx_offset != 0 || + sx.sx_length != ip->i_d.di_size || + sx.sx_length != tip->i_d.di_size) { + error = XFS_ERROR(EFAULT); + goto error0; + } + + /* + * If the target has extended attributes, the tmp file + * must also in order to ensure the correct data fork + * format. + */ + if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. 
+ */ + if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || + (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || + (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || + (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + error = XFS_ERROR(EBUSY); + goto error0; + } + + /* We need to fail if the file is memory mapped. Once we have tossed + * all existing pages, the page fault will have no option + * but to go to the filesystem for pages. By making the page fault call + * VOP_READ (or write in the case of autogrow) they block on the iolock + * until we have switched the extents. + */ + if (VN_MAPPED(vp)) { + error = XFS_ERROR(EBUSY); + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); + + /* + * There is a race condition here since we gave up the + * ilock. However, the data fork will not change since + * we have the iolock (locked for truncation too) so we + * are safe. We don't really care if non-io related + * fields change. + */ + + VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + if ((error = xfs_trans_reserve(tp, 0, + XFS_ICHANGE_LOG_RES(mp), 0, + 0, 0))) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_IOLOCK_EXCL); + xfs_trans_cancel(tp, 0); + return error; + } + xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) { + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + xfs_trans_cancel(tp, 0); + return error; + } + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) { + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + xfs_trans_cancel(tp, 
0); + return error; + } + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + tempif = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = tempif; /* struct copy */ + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + ilf_fields = XFS_ILOG_CORE; + + switch(ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + ilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ilf_fields |= XFS_ILOG_DBROOT; + break; + } + + tilf_fields = XFS_ILOG_CORE; + + switch(tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + tilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + tilf_fields |= XFS_ILOG_DBROOT; + break; + } + + /* + * Increment vnode ref counts since xfs_trans_commit & + * xfs_trans_cancel will both unlock the inodes and + * decrement the associated ref counts. 
+ */ + VN_HOLD(vp); + VN_HOLD(tvp); + + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); + + xfs_trans_log_inode(tp, ip, ilf_fields); + xfs_trans_log_inode(tp, tip, tilf_fields); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(tp); + } + + error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL); + + fput(fp); + fput(tfp); + + return error; + + error0: + if (locked) { + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + } + + if (fp != NULL) fput(fp); + if (tfp != NULL) fput(tfp); + + return error; +} diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dfrag.h linux-2.4-xfs/fs/xfs/xfs_dfrag.h --- linux-2.4.19/fs/xfs/xfs_dfrag.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_dfrag.h Wed Jul 10 23:13:59 2002 @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. 
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DFRAG_H__
+#define __XFS_DFRAG_H__
+
+/*
+ * Structure passed to xfs_swapext
+ */
+
+typedef struct xfs_swapext
+{
+	__int64_t	sx_version;	/* version */
+	__int64_t	sx_fdtarget;	/* fd of target file */
+	__int64_t	sx_fdtmp;	/* fd of tmp file */
+	xfs_off_t	sx_offset;	/* offset into file */
+	xfs_off_t	sx_length;	/* length from offset */
+	char		sx_pad[16];	/* pad space, unused */
+	xfs_bstat_t	sx_stat;	/* stat of target before copy */
+} xfs_swapext_t;
+
+/*
+ * Version flag
+ */
+#define XFS_SX_VERSION		0
+
+#ifdef __KERNEL__
+/*
+ * Prototypes for visible xfs_dfrag.c routines.
+ */
+
+/*
+ * Syscall interface for xfs_swapext
+ * sx is a user-space pointer; the routine reads it with copy_from_user().
+ */
+int	xfs_swapext(struct xfs_swapext *sx);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DFRAG_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dinode.h linux-2.4-xfs/fs/xfs/xfs_dinode.h
--- linux-2.4.19/fs/xfs/xfs_dinode.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_dinode.h Wed Jul 10 23:14:00 2002
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DINODE_H__
+#define __XFS_DINODE_H__
+
+struct xfs_buf;
+struct xfs_mount;
+
+#define XFS_DINODE_VERSION_1	1
+#define XFS_DINODE_VERSION_2	2
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DINODE_GOOD_VERSION)
+int xfs_dinode_good_version(int v);
+#define XFS_DINODE_GOOD_VERSION(v)	xfs_dinode_good_version(v)
+#else
+#define XFS_DINODE_GOOD_VERSION(v)	(((v) == XFS_DINODE_VERSION_1) || \
+					 ((v) == XFS_DINODE_VERSION_2))	/* true for version 1 or 2 */
+#endif
+#define XFS_DINODE_MAGIC	0x494e	/* 'IN' */
+
+/*
+ * Disk inode structure.
+ * This is just the header; the inode is expanded to fill a variable size
+ * with the last field expanding. It is split into the core and "other"
+ * because we only need the core part in the in-core inode.
+ */
+typedef struct xfs_timestamp {
+	__int32_t	t_sec;		/* timestamp seconds */
+	__int32_t	t_nsec;		/* timestamp nanoseconds */
+} xfs_timestamp_t;
+
+/*
+ * Note: Coordinate changes to this structure with the XFS_DI_* #defines
+ * below and the offsets table in xfs_ialloc_log_di().
+ */
+typedef struct xfs_dinode_core
+{
+	__uint16_t	di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
+	__uint16_t	di_mode;	/* mode and type of file */
+	__int8_t	di_version;	/* inode version */
+	__int8_t	di_format;	/* format of di_u data (xfs_dinode_fmt_t) */
+	__uint16_t	di_onlink;	/* old number of links to file */
+	__uint32_t	di_uid;		/* owner's user id */
+	__uint32_t	di_gid;		/* owner's group id */
+	__uint32_t	di_nlink;	/* number of links to file */
+	__uint16_t	di_projid;	/* owner's project id */
+	__uint8_t	di_pad[10];	/* unused, zeroed space */
+	xfs_timestamp_t	di_atime;	/* time last accessed */
+	xfs_timestamp_t	di_mtime;	/* time last modified */
+	xfs_timestamp_t	di_ctime;	/* time created/inode modified */
+	xfs_fsize_t	di_size;	/* number of bytes in file */
+	xfs_drfsbno_t	di_nblocks;	/* # of direct & btree blocks used */
+	xfs_extlen_t	di_extsize;	/* basic/minimum extent size for file */
+	xfs_extnum_t	di_nextents;	/* number of extents in data fork */
+	xfs_aextnum_t	di_anextents;	/* number of extents in attribute fork*/
+	__uint8_t	di_forkoff;	/* attr fork offs, <<3 for 64b align */
+	__int8_t	di_aformat;	/* format of attr fork's data */
+	__uint32_t	di_dmevmask;	/* DMIG event mask */
+	__uint16_t	di_dmstate;	/* DMIG state info */
+	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
+	__uint32_t	di_gen;		/* generation number */
+} xfs_dinode_core_t;
+
+typedef struct xfs_dinode
+{
+	xfs_dinode_core_t	di_core;
+	/*
+	 * In adding anything between the core and the union, be
+	 * sure to update the macros like XFS_LITINO below and
+	 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
+	 */
+	xfs_agino_t		di_next_unlinked;/* agi unlinked list ptr */
+	union {
+		xfs_bmdr_block_t di_bmbt;	/* btree root block */
+		xfs_bmbt_rec_32_t di_bmx[1];	/* extent list */
+		xfs_dir_shortform_t di_dirsf;	/* shortform directory */
+		xfs_dir2_sf_t	di_dir2sf;	/* shortform directory v2 */
+		char		di_c[1];	/* local contents */
+		xfs_dev_t	di_dev;		/* device for IFCHR/IFBLK */
+		uuid_t		di_muuid;	/* mount point value */
+		char		di_symlink[1];	/* local symbolic link */
+	}		di_u;
+	union {
+		xfs_bmdr_block_t di_abmbt;	/* btree root block */
+		xfs_bmbt_rec_32_t di_abmx[1];	/* extent list */
+		xfs_attr_shortform_t di_attrsf;	/* shortform attribute list */
+	}		di_a;
+} xfs_dinode_t;
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define XFS_MAXLINK		((1U << 31) - 1U)
+#define XFS_MAXLINK_1		65535U
+
+/*
+ * Bit names for logging disk inodes only
+ *
+ * One bit per xfs_dinode_core field, in declaration order, followed by
+ * di_next_unlinked, di_u and di_a: XFS_DI_NUM_BITS bits in all.
+ */
+#define XFS_DI_MAGIC		0x0000001
+#define XFS_DI_MODE		0x0000002
+#define XFS_DI_VERSION		0x0000004
+#define XFS_DI_FORMAT		0x0000008
+#define XFS_DI_ONLINK		0x0000010
+#define XFS_DI_UID		0x0000020
+#define XFS_DI_GID		0x0000040
+#define XFS_DI_NLINK		0x0000080
+#define XFS_DI_PROJID		0x0000100
+#define XFS_DI_PAD		0x0000200
+#define XFS_DI_ATIME		0x0000400
+#define XFS_DI_MTIME		0x0000800
+#define XFS_DI_CTIME		0x0001000
+#define XFS_DI_SIZE		0x0002000
+#define XFS_DI_NBLOCKS		0x0004000
+#define XFS_DI_EXTSIZE		0x0008000
+#define XFS_DI_NEXTENTS		0x0010000
+#define XFS_DI_NAEXTENTS	0x0020000
+#define XFS_DI_FORKOFF		0x0040000
+#define XFS_DI_AFORMAT		0x0080000
+#define XFS_DI_DMEVMASK		0x0100000
+#define XFS_DI_DMSTATE		0x0200000
+#define XFS_DI_FLAGS		0x0400000
+#define XFS_DI_GEN		0x0800000
+#define XFS_DI_NEXT_UNLINKED	0x1000000
+#define XFS_DI_U		0x2000000
+#define XFS_DI_A		0x4000000
+#define XFS_DI_NUM_BITS		27
+#define XFS_DI_ALL_BITS		((1 << XFS_DI_NUM_BITS) - 1)
+#define XFS_DI_CORE_BITS	(XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
+
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt
+{
+	XFS_DINODE_FMT_DEV,		/* CHR, BLK: di_dev */
+	XFS_DINODE_FMT_LOCAL,		/* DIR, REG: di_c */
+					/* LNK: di_symlink */
+	XFS_DINODE_FMT_EXTENTS,		/* DIR, REG, LNK: di_bmx */
+	XFS_DINODE_FMT_BTREE,		/* DIR, REG, LNK: di_bmbt */
+	XFS_DINODE_FMT_UUID		/* MNT: di_muuid */
+} xfs_dinode_fmt_t;
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG	8
+#define XFS_DINODE_MAX_LOG	11
+#define XFS_DINODE_MIN_SIZE	(1 << XFS_DINODE_MIN_LOG)	/* 256 bytes */
+#define XFS_DINODE_MAX_SIZE	(1 << XFS_DINODE_MAX_LOG)	/* 2048 bytes */
+
+/*
+ * Inode size for given fs.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LITINO)
+int xfs_litino(struct xfs_mount *mp);
+#define XFS_LITINO(mp)		xfs_litino(mp)
+#else
+#define XFS_LITINO(mp)	((mp)->m_litino)
+#endif
+#define XFS_BROOT_SIZE_ADJ	\
+	(sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
+
+/*
+ * Fork identifiers. Here so utilities can use them without including
+ * xfs_inode.h.
+ */
+#define XFS_DATA_FORK	0
+#define XFS_ATTR_FORK	1
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ * In the macro forms, XFS_CFORK_Q tests di_forkoff for non-zero and + * XFS_CFORK_BOFF is di_forkoff shifted left by 3. The data fork size is + * that offset when di_forkoff is set and XFS_LITINO(mp) otherwise; the + * attribute fork size is the remainder of XFS_LITINO(mp), or 0. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_Q) +int xfs_cfork_q_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch); +int xfs_cfork_q(xfs_dinode_core_t *dcp); +#define XFS_CFORK_Q_ARCH(dcp,arch) xfs_cfork_q_arch(dcp,arch) +#define XFS_CFORK_Q(dcp) xfs_cfork_q(dcp) +#else +#define XFS_CFORK_Q_ARCH(dcp,arch) (!INT_ISZERO((dcp)->di_forkoff, arch)) +#define XFS_CFORK_Q(dcp) ((dcp)->di_forkoff != 0) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_BOFF) +int xfs_cfork_boff_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch); +int xfs_cfork_boff(xfs_dinode_core_t *dcp); +#define XFS_CFORK_BOFF_ARCH(dcp,arch) xfs_cfork_boff_arch(dcp,arch) +#define XFS_CFORK_BOFF(dcp) xfs_cfork_boff(dcp) +#else +#define XFS_CFORK_BOFF_ARCH(dcp,arch) ((int)(INT_GET((dcp)->di_forkoff, arch) << 3)) +#define XFS_CFORK_BOFF(dcp) ((int)((dcp)->di_forkoff << 3)) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_DSIZE) +int xfs_cfork_dsize_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, xfs_arch_t arch); +int xfs_cfork_dsize(xfs_dinode_core_t *dcp, struct xfs_mount *mp); +#define XFS_CFORK_DSIZE_ARCH(dcp,mp,arch) xfs_cfork_dsize_arch(dcp,mp,arch) +#define XFS_CFORK_DSIZE(dcp,mp) xfs_cfork_dsize(dcp,mp) +#else +#define XFS_CFORK_DSIZE_ARCH(dcp,mp,arch) \ + (XFS_CFORK_Q_ARCH(dcp, arch) ? XFS_CFORK_BOFF_ARCH(dcp, arch) : XFS_LITINO(mp)) +#define XFS_CFORK_DSIZE(dcp,mp) \ + (XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp)) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_ASIZE) +int xfs_cfork_asize_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, xfs_arch_t arch); +int xfs_cfork_asize(xfs_dinode_core_t *dcp, struct xfs_mount *mp); +#define XFS_CFORK_ASIZE_ARCH(dcp,mp,arch) xfs_cfork_asize_arch(dcp,mp,arch) +#define XFS_CFORK_ASIZE(dcp,mp) xfs_cfork_asize(dcp,mp) +#else +#define XFS_CFORK_ASIZE_ARCH(dcp,mp,arch) \ + (XFS_CFORK_Q_ARCH(dcp, arch) ? 
XFS_LITINO(mp) - XFS_CFORK_BOFF_ARCH(dcp, arch) : 0) +#define XFS_CFORK_ASIZE(dcp,mp) \ + (XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_SIZE) +int xfs_cfork_size_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w, xfs_arch_t arch); +int xfs_cfork_size(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w); +#define XFS_CFORK_SIZE_ARCH(dcp,mp,w,arch) xfs_cfork_size_arch(dcp,mp,w,arch) +#define XFS_CFORK_SIZE(dcp,mp,w) xfs_cfork_size(dcp,mp,w) +#else +#define XFS_CFORK_SIZE_ARCH(dcp,mp,w,arch) \ + ((w) == XFS_DATA_FORK ? \ + XFS_CFORK_DSIZE_ARCH(dcp, mp, arch) : XFS_CFORK_ASIZE_ARCH(dcp, mp, arch)) +#define XFS_CFORK_SIZE(dcp,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp)) + +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DSIZE) +int xfs_dfork_dsize_arch(xfs_dinode_t *dip, struct xfs_mount *mp, xfs_arch_t arch); +int xfs_dfork_dsize(xfs_dinode_t *dip, struct xfs_mount *mp); +#define XFS_DFORK_DSIZE_ARCH(dip,mp,arch) xfs_dfork_dsize_arch(dip,mp,arch) +#define XFS_DFORK_DSIZE(dip,mp) xfs_dfork_dsize(dip,mp) +#else +#define XFS_DFORK_DSIZE_ARCH(dip,mp,arch) XFS_CFORK_DSIZE_ARCH(&(dip)->di_core, mp, arch) +#define XFS_DFORK_DSIZE(dip,mp) XFS_DFORK_DSIZE_ARCH(dip,mp,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_ASIZE) +int xfs_dfork_asize_arch(xfs_dinode_t *dip, struct xfs_mount *mp, xfs_arch_t arch); +int xfs_dfork_asize(xfs_dinode_t *dip, struct xfs_mount *mp); +#define XFS_DFORK_ASIZE_ARCH(dip,mp,arch) xfs_dfork_asize_arch(dip,mp,arch) +#define XFS_DFORK_ASIZE(dip,mp) xfs_dfork_asize(dip,mp) +#else +#define XFS_DFORK_ASIZE_ARCH(dip,mp,arch) XFS_CFORK_ASIZE_ARCH(&(dip)->di_core, mp, arch) +#define XFS_DFORK_ASIZE(dip,mp) XFS_DFORK_ASIZE_ARCH(dip,mp,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_SIZE) +int xfs_dfork_size_arch(xfs_dinode_t *dip, 
struct xfs_mount *mp, int w, xfs_arch_t arch); +int xfs_dfork_size(xfs_dinode_t *dip, struct xfs_mount *mp, int w); +#define XFS_DFORK_SIZE_ARCH(dip,mp,w,arch) xfs_dfork_size_arch(dip,mp,w,arch) +#define XFS_DFORK_SIZE(dip,mp,w) xfs_dfork_size(dip,mp,w) +#else +#define XFS_DFORK_SIZE_ARCH(dip,mp,w,arch) XFS_CFORK_SIZE_ARCH(&(dip)->di_core, mp, w, arch) +#define XFS_DFORK_SIZE(dip,mp,w) XFS_DFORK_SIZE_ARCH(dip,mp,w,ARCH_NOCONVERT) + +#endif + +/* + * Macros for accessing per-fork disk inode information. + * The XFS_DFORK_* forms take a whole xfs_dinode_t; where an XFS_CFORK_* + * counterpart exists they forward to it with the address of di_core. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_Q) +int xfs_dfork_q_arch(xfs_dinode_t *dip, xfs_arch_t arch); +int xfs_dfork_q(xfs_dinode_t *dip); +#define XFS_DFORK_Q_ARCH(dip,arch) xfs_dfork_q_arch(dip,arch) +#define XFS_DFORK_Q(dip) xfs_dfork_q(dip) +#else +#define XFS_DFORK_Q_ARCH(dip,arch) XFS_CFORK_Q_ARCH(&(dip)->di_core, arch) +#define XFS_DFORK_Q(dip) XFS_DFORK_Q_ARCH(dip,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_BOFF) +int xfs_dfork_boff_arch(xfs_dinode_t *dip, xfs_arch_t arch); +int xfs_dfork_boff(xfs_dinode_t *dip); +#define XFS_DFORK_BOFF_ARCH(dip,arch) xfs_dfork_boff_arch(dip,arch) +#define XFS_DFORK_BOFF(dip) xfs_dfork_boff(dip) +#else +#define XFS_DFORK_BOFF_ARCH(dip,arch) XFS_CFORK_BOFF_ARCH(&(dip)->di_core, arch) +#define XFS_DFORK_BOFF(dip) XFS_DFORK_BOFF_ARCH(dip,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DPTR) +char *xfs_dfork_dptr_arch(xfs_dinode_t *dip, xfs_arch_t arch); +char *xfs_dfork_dptr(xfs_dinode_t *dip); +#define XFS_DFORK_DPTR_ARCH(dip,arch) xfs_dfork_dptr_arch(dip,arch) +#define XFS_DFORK_DPTR(dip) xfs_dfork_dptr(dip) +#else +#define XFS_DFORK_DPTR_ARCH(dip,arch) ((dip)->di_u.di_c) /* arch is unused in this form */ +#define XFS_DFORK_DPTR(dip) XFS_DFORK_DPTR_ARCH(dip,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_APTR) +char *xfs_dfork_aptr_arch(xfs_dinode_t *dip, xfs_arch_t arch); +char *xfs_dfork_aptr(xfs_dinode_t *dip); +#define 
XFS_DFORK_APTR_ARCH(dip,arch) xfs_dfork_aptr_arch(dip,arch) +#define XFS_DFORK_APTR(dip) xfs_dfork_aptr(dip) +#else +#define XFS_DFORK_APTR_ARCH(dip,arch) ((dip)->di_u.di_c + XFS_DFORK_BOFF_ARCH(dip, arch)) +#define XFS_DFORK_APTR(dip) XFS_DFORK_APTR_ARCH(dip,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_PTR) +char *xfs_dfork_ptr_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch); +char *xfs_dfork_ptr(xfs_dinode_t *dip, int w); +#define XFS_DFORK_PTR_ARCH(dip,w,arch) xfs_dfork_ptr_arch(dip,w,arch) +#define XFS_DFORK_PTR(dip,w) xfs_dfork_ptr(dip,w) +#else +#define XFS_DFORK_PTR_ARCH(dip,w,arch) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR_ARCH(dip, arch) : XFS_DFORK_APTR_ARCH(dip, arch)) +#define XFS_DFORK_PTR(dip,w) XFS_DFORK_PTR_ARCH(dip,w,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FORMAT) +int xfs_cfork_format_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch); +int xfs_cfork_format(xfs_dinode_core_t *dcp, int w); +#define XFS_CFORK_FORMAT_ARCH(dcp,w,arch) xfs_cfork_format_arch(dcp,w,arch) +#define XFS_CFORK_FORMAT(dcp,w) xfs_cfork_format(dcp,w) +#else +#define XFS_CFORK_FORMAT_ARCH(dcp,w,arch) \ + ((w) == XFS_DATA_FORK ? INT_GET((dcp)->di_format, arch) : INT_GET((dcp)->di_aformat, arch)) +#define XFS_CFORK_FORMAT(dcp,w) XFS_CFORK_FORMAT_ARCH(dcp,w,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FMT_SET) +void xfs_cfork_fmt_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch); +void xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n); +#define XFS_CFORK_FMT_SET_ARCH(dcp,w,n,arch) xfs_cfork_fmt_set_arch(dcp,w,n,arch) +#define XFS_CFORK_FMT_SET(dcp,w,n) xfs_cfork_fmt_set(dcp,w,n) +#else +#define XFS_CFORK_FMT_SET_ARCH(dcp,w,n,arch) \ + ((w) == XFS_DATA_FORK ? 
\ + (INT_SET((dcp)->di_format, arch, (n))) : \ + (INT_SET((dcp)->di_aformat, arch, (n)))) +#define XFS_CFORK_FMT_SET(dcp,w,n) XFS_CFORK_FMT_SET_ARCH(dcp,w,n,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXTENTS) +int xfs_cfork_nextents_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch); +int xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w); +#define XFS_CFORK_NEXTENTS_ARCH(dcp,w,arch) xfs_cfork_nextents_arch(dcp,w,arch) +#define XFS_CFORK_NEXTENTS(dcp,w) xfs_cfork_nextents(dcp,w) +#else +#define XFS_CFORK_NEXTENTS_ARCH(dcp,w,arch) \ + ((w) == XFS_DATA_FORK ? INT_GET((dcp)->di_nextents, arch) : INT_GET((dcp)->di_anextents, arch)) +#define XFS_CFORK_NEXTENTS(dcp,w) XFS_CFORK_NEXTENTS_ARCH(dcp,w,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXT_SET) +void xfs_cfork_next_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch); +void xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n); +#define XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,arch) xfs_cfork_next_set_arch(dcp,w,n,arch) +#define XFS_CFORK_NEXT_SET(dcp,w,n) xfs_cfork_next_set(dcp,w,n) +#else +#define XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,arch) \ + ((w) == XFS_DATA_FORK ? 
\ + (INT_SET((dcp)->di_nextents, arch, (n))) : \ + (INT_SET((dcp)->di_anextents, arch, (n)))) +#define XFS_CFORK_NEXT_SET(dcp,w,n) XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_FORMAT) +int xfs_dfork_format_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch); +int xfs_dfork_format(xfs_dinode_t *dip, int w); +#define XFS_DFORK_FORMAT_ARCH(dip,w,arch) xfs_dfork_format_arch(dip,w,arch) +#define XFS_DFORK_FORMAT(dip,w) xfs_dfork_format(dip,w) +#else +#define XFS_DFORK_FORMAT_ARCH(dip,w,arch) XFS_CFORK_FORMAT_ARCH(&(dip)->di_core, w, arch) +#define XFS_DFORK_FORMAT(dip,w) XFS_DFORK_FORMAT_ARCH(dip,w,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_FMT_SET) +void xfs_dfork_fmt_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch); +void xfs_dfork_fmt_set(xfs_dinode_t *dip, int w, int n); +#define XFS_DFORK_FMT_SET_ARCH(dip,w,n,arch) xfs_dfork_fmt_set_arch(dip,w,n,arch) +#define XFS_DFORK_FMT_SET(dip,w,n) xfs_dfork_fmt_set(dip,w,n) +#else +#define XFS_DFORK_FMT_SET_ARCH(dip,w,n,arch) XFS_CFORK_FMT_SET_ARCH(&(dip)->di_core, w, n, arch) +#define XFS_DFORK_FMT_SET(dip,w,n) XFS_DFORK_FMT_SET_ARCH(dip,w,n,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXTENTS) +int xfs_dfork_nextents_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch); +int xfs_dfork_nextents(xfs_dinode_t *dip, int w); +#define XFS_DFORK_NEXTENTS_ARCH(dip,w,arch) xfs_dfork_nextents_arch(dip,w,arch) +#define XFS_DFORK_NEXTENTS(dip,w) xfs_dfork_nextents(dip,w) +#else +#define XFS_DFORK_NEXTENTS_ARCH(dip,w,arch) XFS_CFORK_NEXTENTS_ARCH(&(dip)->di_core, w, arch) +#define XFS_DFORK_NEXTENTS(dip,w) XFS_DFORK_NEXTENTS_ARCH(dip,w,ARCH_NOCONVERT) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXT_SET) +void xfs_dfork_next_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch); +void xfs_dfork_next_set(xfs_dinode_t *dip, int w, int n); +#define 
XFS_DFORK_NEXT_SET_ARCH(dip,w,n,arch) xfs_dfork_next_set_arch(dip,w,n,arch) +#define XFS_DFORK_NEXT_SET(dip,w,n) xfs_dfork_next_set(dip,w,n) +#else +#define XFS_DFORK_NEXT_SET_ARCH(dip,w,n,arch) XFS_CFORK_NEXT_SET_ARCH(&(dip)->di_core, w, n, arch) +#define XFS_DFORK_NEXT_SET(dip,w,n) XFS_DFORK_NEXT_SET_ARCH(dip,w,n,ARCH_NOCONVERT) + +#endif + +/* + * File types (mode field) + */ +#define IFMT 0170000 /* type of file */ +#define IFIFO 0010000 /* named pipe (fifo) */ +#define IFCHR 0020000 /* character special */ +#define IFDIR 0040000 /* directory */ +#define IFBLK 0060000 /* block special */ +#define IFREG 0100000 /* regular */ +#define IFLNK 0120000 /* symbolic link */ +#define IFSOCK 0140000 /* socket */ +#define IFMNT 0160000 /* mount point */ + +/* + * File execution and access modes. + */ +#define ISUID 04000 /* set user id on execution */ +#define ISGID 02000 /* set group id on execution */ +#define ISVTX 01000 /* sticky directory */ +#define IREAD 0400 /* read, write, execute permissions */ +#define IWRITE 0200 +#define IEXEC 0100 + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_DINODE) +xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp); +#define XFS_BUF_TO_DINODE(bp) xfs_buf_to_dinode(bp) +#else +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)(XFS_BUF_PTR(bp))) +#endif + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s. 
+ * The *_BIT names are bit positions; the unsuffixed names are the masks + * built from them, and XFS_DIFLAG_ALL is the OR of all three masks. + */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_ALL \ + (XFS_DIFLAG_REALTIME|XFS_DIFLAG_PREALLOC|XFS_DIFLAG_NEWRTBM) + +#endif /* __XFS_DINODE_H__ */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dir.c linux-2.4-xfs/fs/xfs/xfs_dir.c --- linux-2.4.19/fs/xfs/xfs_dir.c Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_dir.c Tue Aug 6 16:48:19 2002 @@ -0,0 +1,1183 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include <xfs.h> + + +/* + * xfs_dir.c + * + * Provide the external interfaces to manage directories. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Functions for the dirops interfaces. + */ +static void xfs_dir_mount(struct xfs_mount *mp); + +static int xfs_dir_isempty(struct xfs_inode *dp); + +static int xfs_dir_init(struct xfs_trans *trans, + struct xfs_inode *dir, + struct xfs_inode *parent_dir); + +static int xfs_dir_createname(struct xfs_trans *trans, + struct xfs_inode *dp, + char *name_string, + int name_len, + xfs_ino_t inode_number, + xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, + xfs_extlen_t total); + +static int xfs_dir_lookup(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name_string, + int name_length, + xfs_ino_t *inode_number); + +static int xfs_dir_removename(struct xfs_trans *trans, + struct xfs_inode *dp, + char *name_string, + int name_length, + xfs_ino_t ino, + xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, + xfs_extlen_t total); + +static int xfs_dir_getdents(struct xfs_trans *tp, + struct xfs_inode *dp, + struct uio *uiop, + int *eofp); + +static int xfs_dir_replace(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name_string, + int name_length, + xfs_ino_t inode_number, + xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, + xfs_extlen_t total); + +static int xfs_dir_canenter(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name_string, + int name_length); + +static int xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, + xfs_dinode_t *dip); + +/* + * Operations vector for the directory code in this file; every slot except + * xd_shortform_to_single points at one of the static handlers declared above. + */ +xfs_dirops_t xfsv1_dirops = { + .xd_mount = 
xfs_dir_mount, + .xd_isempty = xfs_dir_isempty, + .xd_init = xfs_dir_init, + .xd_createname = xfs_dir_createname, + .xd_lookup = xfs_dir_lookup, + .xd_removename = xfs_dir_removename, + .xd_getdents = xfs_dir_getdents, + .xd_replace = xfs_dir_replace, + .xd_canenter = xfs_dir_canenter, + .xd_shortform_validate_ondisk = xfs_dir_shortform_validate_ondisk, + .xd_shortform_to_single = xfs_dir_shortform_to_leaf, +}; + +/* + * Internal routines when dirsize == XFS_LBSIZE(mp). + */ +STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args); +STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries, + int *total_namebytes); +STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, + uio_t *uio, int *eofp, + xfs_dirent_t *dbp, + xfs_dir_put_t put); +STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args); + +/* + * Internal routines when dirsize > XFS_LBSIZE(mp). + */ +STATIC int xfs_dir_node_addname(xfs_da_args_t *args); +STATIC int xfs_dir_node_lookup(xfs_da_args_t *args); +STATIC int xfs_dir_node_removename(xfs_da_args_t *args); +STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, + uio_t *uio, int *eofp, + xfs_dirent_t *dbp, + xfs_dir_put_t put); +STATIC int xfs_dir_node_replace(xfs_da_args_t *args); + +#if defined(DEBUG) +ktrace_t *xfs_dir_trace_buf; +#endif + + +/*======================================================================== + * Overall external interface routines. + *========================================================================*/ + +xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; + +/* + * One-time startup routine called from xfs_init(). + * Caches the hash values of "." and ".." in xfs_dir_hash_dot and + * xfs_dir_hash_dotdot. + */ +void +xfs_dir_startup(void) +{ + xfs_dir_hash_dot = xfs_da_hashname(".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); +} + +/* + * Initialize directory-related fields in the mount structure. 
+ * Sets m_dirversion to 1, sizes the cookie entry log from the larger of the + * shortform and leaf entry counts, and sets the directory block size to one + * filesystem block. + */ +static void +xfs_dir_mount(xfs_mount_t *mp) +{ + uint shortcount, leafcount, count; + + mp->m_dirversion = 1; + shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) / + (uint)sizeof(xfs_dir_sf_entry_t); + leafcount = (XFS_LBSIZE(mp) - (uint)sizeof(xfs_dir_leaf_hdr_t)) / + ((uint)sizeof(xfs_dir_leaf_entry_t) + + (uint)sizeof(xfs_dir_leaf_name_t)); + count = shortcount > leafcount ? shortcount : leafcount; + mp->m_dircook_elog = xfs_da_log2_roundup(count + 1); + ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog); + mp->m_da_node_ents = + (XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100; /* 37% of XFS_LBSIZE(mp) */ + mp->m_dirblksize = mp->m_sb.sb_blocksize; + mp->m_dirblkfsbs = 1; +} + +/* + * Return 1 if directory contains only "." and "..". + * A zero di_size is empty, a di_size larger than XFS_IFORK_DSIZE(dp) is not, + * and otherwise the count in the shortform header decides. + */ +static int +xfs_dir_isempty(xfs_inode_t *dp) +{ + xfs_dir_sf_hdr_t *hdr; + + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + if (dp->i_d.di_size == 0) + return(1); + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return(0); + hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data; + return(hdr->count == 0); +} + +/* + * Initialize a directory with its "." and ".." entries. + * The parent inode number is validated first; the directory is then created + * in shortform. + */ +static int +xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir) +{ + xfs_da_args_t args; + int error; + + bzero((char *)&args, sizeof(args)); + args.dp = dir; + args.trans = trans; + + ASSERT((dir->i_d.di_mode & IFMT) == IFDIR); + if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino))) + return error; + + return(xfs_dir_shortform_create(&args, parent_dir->i_ino)); +} + +/* + * Generic handler routine to add a name to a directory. + * Transitions directory from shortform to Btree as necessary. 
+ */ +static int /* error */ +xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name, + int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, xfs_extlen_t total) +{ + xfs_da_args_t args; + int retval, newsize, done; + + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + + if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum))) + return (retval); + + XFS_STATS_INC(xfsstats.xs_dir_create); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = inum; + args.dp = dp; + args.firstblock = firstblock; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = 0; + args.addname = args.oknoent = 1; + + /* + * Decide on what work routines to call based on the inode size. + * "done" stays 0 while the name still has to be added in the next, + * larger format. + */ + done = 0; + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen); + if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) { + retval = xfs_dir_shortform_addname(&args); + done = 1; + } else { + if (total == 0) + return XFS_ERROR(ENOSPC); + retval = xfs_dir_shortform_to_leaf(&args); + done = retval != 0; + } + } + if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_addname(&args); + done = retval != ENOSPC; + if (!done) { + if (total == 0) + return XFS_ERROR(ENOSPC); + retval = xfs_dir_leaf_to_node(&args); + done = retval != 0; + } + } + if (!done) { + retval = xfs_dir_node_addname(&args); + } + return(retval); +} + +/* + * Generic handler routine to check if a name can be added to a directory, + * without adding any blocks to the directory. + */ +static int /* error */ +xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen) +{ + xfs_da_args_t args; + int retval, newsize; + + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + /* + * Fill in the arg structure for this request. 
+ */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = 0; + args.dp = dp; + args.firstblock = NULL; + args.flist = NULL; + args.total = 0; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = args.oknoent = 1; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen); + if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) + retval = 0; + else + retval = XFS_ERROR(ENOSPC); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_addname(&args); + } else { + retval = xfs_dir_node_addname(&args); + } + return(retval); +} + +/* + * Generic handler routine to remove a name from a directory. + * Transitions directory from Btree to shortform as necessary. + */ +static int /* error */ +xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name, + int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, xfs_extlen_t total) +{ + xfs_da_args_t args; + int count, totallen, newsize, retval; + + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + XFS_STATS_INC(xfsstats.xs_dir_remove); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = ino; + args.dp = dp; + args.firstblock = firstblock; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = args.oknoent = 0; + + /* + * Decide on what work routines to call based on the inode size. 
+ */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_removename(&args); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_removename(&args, &count, &totallen); + if (retval == 0) { + newsize = XFS_DIR_SF_ALLFIT(count, totallen); + if (newsize <= XFS_IFORK_DSIZE(dp)) { + retval = xfs_dir_leaf_to_shortform(&args); + } + } + } else { + retval = xfs_dir_node_removename(&args); + } + return(retval); +} + +/* + * Look up a name in a directory and hand its inode number back through *inum. + * An EEXIST result from the format-specific lookup is reported to the caller + * as 0. + */ +static int /* error */ +xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen, + xfs_ino_t *inum) +{ + xfs_da_args_t args; + int retval; + + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + if (namelen >= MAXNAMELEN) { + return(XFS_ERROR(EINVAL)); + } + + XFS_STATS_INC(xfsstats.xs_dir_lookup); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = 0; + args.dp = dp; + args.firstblock = NULL; + args.flist = NULL; + args.total = 0; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = 0; + args.oknoent = 1; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_lookup(&args); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_lookup(&args); + } else { + retval = xfs_dir_node_lookup(&args); + } + if (retval == EEXIST) + retval = 0; + *inum = args.inumber; + return(retval); +} + +/* + * Implement readdir. + */ +static int /* error */ +xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp) +{ + xfs_dirent_t *dbp; + int alignment, retval; + xfs_dir_put_t put; + + XFS_STATS_INC(xfsstats.xs_dir_getdents); + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + + /* + * If our caller has given us a single contiguous memory buffer, + * just work directly within that buffer. 
If it's in user memory, + * lock it down first. + */ + alignment = sizeof(xfs_off_t) - 1; + if ((uio->uio_iovcnt == 1) && + (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) && + ((uio->uio_iov[0].iov_len & alignment) == 0)) { + dbp = NULL; + put = xfs_dir_put_dirent64_direct; + } else { + dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP); + put = xfs_dir_put_dirent64_uio; + } + + /* + * Decide on what work routines to call based on the inode size. + */ + *eofp = 0; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put); + } else { + retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put); + } + if (dbp != NULL) + kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN); + + return(retval); +} + +/* + * Replace the inode number stored for an existing name with inum. + * The new inode number is validated before the directory is touched. + */ +static int /* error */ +xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen, + xfs_ino_t inum, xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, xfs_extlen_t total) +{ + xfs_da_args_t args; + int retval; + + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + if (namelen >= MAXNAMELEN) { + return(XFS_ERROR(EINVAL)); + } + + if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum))) + return retval; + + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = inum; + args.dp = dp; + args.firstblock = firstblock; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = args.oknoent = 0; + + /* + * Decide on what work routines to call based on the inode size. 
+ */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_replace(&args); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_replace(&args); + } else { + retval = xfs_dir_node_replace(&args); + } + + return(retval); +} + +/* + * Sanity-check an on-disk shortform directory. + * Returns 0 when the inode is not a local-format directory or passes the + * checks, and 1 when a check fails. + */ +static int +xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp) +{ + xfs_ino_t ino; + int namelen_sum; + int count; + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + int i; + + + + if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & IFMT) != IFDIR) { + return 0; + } + if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) { + return 0; + } + if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) { + xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p\n", + dp); + return 1; + } + sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf); + ino = XFS_GET_DIR_INO_ARCH(mp, sf->hdr.parent, ARCH_CONVERT); + if (xfs_dir_ino_validate(mp, ino)) + return 1; + + count = sf->hdr.count; + if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) { + xfs_fs_cmn_err(CE_WARN, mp, + "Invalid shortform count: dp 0x%p\n", dp); + return(1); + } + + if (count == 0) { + return 0; + } + + namelen_sum = 0; + sfe = &sf->list[0]; + for (i = sf->hdr.count - 1; i >= 0; i--) { + ino = XFS_GET_DIR_INO_ARCH(mp, sfe->inumber, ARCH_CONVERT); + xfs_dir_ino_validate(mp, ino); /* NOTE(review): result is ignored here, unlike the parent check above -- confirm intended */ + if (sfe->namelen >= XFS_LITINO(mp)) { + xfs_fs_cmn_err(CE_WARN, mp, + "Invalid shortform namelen: dp 0x%p\n", dp); + return 1; + } + namelen_sum += sfe->namelen; + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + } + if (namelen_sum >= XFS_LITINO(mp)) { + xfs_fs_cmn_err(CE_WARN, mp, + "Invalid shortform namelen: dp 0x%p\n", dp); + return 1; + } + + return 0; +} + +/*======================================================================== + * External routines when dirsize == XFS_LBSIZE(dp->i_mount). 
+ *========================================================================*/ + +/* + * Add a name to the leaf directory structure + * This is the external routine. + * The entry is added only when the lookup in directory block 0 returns + * ENOENT; any other lookup result is returned unchanged. + */ +int +xfs_dir_leaf_addname(xfs_da_args_t *args) +{ + int index, retval; + xfs_dabuf_t *bp; + + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + if (retval == ENOENT) + retval = xfs_dir_leaf_add(bp, args, index); + xfs_da_buf_done(bp); + return(retval); +} + +/* + * Remove a name from the leaf directory structure + * This is the external routine. + * When the name is found and removed, the leaf header's entry count and + * name bytes are returned through *count and *totallen and 0 is returned. + */ +STATIC int +xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen) +{ + xfs_dir_leafblock_t *leaf; + int index, retval; + xfs_dabuf_t *bp; + + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + if (retval == EEXIST) { + (void)xfs_dir_leaf_remove(args->trans, bp, index); + *count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + *totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); + retval = 0; + } + xfs_da_buf_done(bp); + return(retval); +} + +/* + * Look up a name in a leaf directory structure. + * This is the external routine. + * The result of xfs_dir_leaf_lookup_int() is returned as is, after the + * buffer has been released. + */ +STATIC int +xfs_dir_leaf_lookup(xfs_da_args_t *args) +{ + int index, retval; + xfs_dabuf_t *bp; + + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + xfs_da_brelse(args->trans, bp); + return(retval); +} + +/* + * Copy out directory entries for getdents(), for leaf directories. 
+ * *eofp is set to 1 when xfs_dir_leaf_getdents_int() reports eob == 0, + * otherwise to 0. + */ +STATIC int +xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, + int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put) +{ + xfs_dabuf_t *bp; + int retval, eob; + + retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK); /* NOTE(review): read uses dp->i_transp but the release below uses trans -- confirm they match */ + if (retval) + return(retval); + ASSERT(bp != NULL); + retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1); + xfs_da_brelse(trans, bp); + *eofp = (eob == 0); + return(retval); +} + +/* + * Look up a name in a leaf directory structure, replace the inode number. + * This is the external routine. + * Only the inumber field of the matching name structure is rewritten and + * logged; a lookup result other than EEXIST is returned unchanged. + */ +STATIC int +xfs_dir_leaf_replace(xfs_da_args_t *args) +{ + int index, retval; + xfs_dabuf_t *bp; + xfs_ino_t inum; + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + + inum = args->inumber; + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + if (retval == EEXIST) { + leaf = bp->data; + entry = &leaf->entries[index]; + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + /* XXX - replace assert? */ + XFS_DIR_SF_PUT_DIRINO_ARCH(&inum, &namest->inumber, ARCH_CONVERT); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber))); + xfs_da_buf_done(bp); + retval = 0; + } else + xfs_da_brelse(args->trans, bp); + return(retval); +} + + +/*======================================================================== + * External routines when dirsize > XFS_LBSIZE(mp). + *========================================================================*/ + +/* + * Add a name to a Btree-format directory. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). 
+ */
+STATIC int
+xfs_dir_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int retval, error;
+
+	/*
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.  The lookup hands back two results: its
+	 * own return value and, through retval, the search outcome.  A
+	 * nonzero return value overrides the outcome.  Anything other than
+	 * ENOENT ends the operation with that value.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		retval = error;
+	if (retval != ENOENT)
+		goto error;
+	blk = &state->path.blk[ state->path.active-1 ];	/* last block on the lookup path */
+	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
+	if (retval == 0) {
+		/*
+		 * Addition succeeded, update Btree hashvals
+		 * (skipped for a "justcheck" request).
+		 */
+		if (!args->justcheck)
+			xfs_da_fixhashpath(state, &state->path);
+	} else {
+		/*
+		 * Addition failed, split as many Btree elements as required.
+		 * When args->total is 0 no split is attempted and the
+		 * failure (asserted to be ENOSPC) is returned unchanged.
+		 */
+		if (args->total == 0) {
+			ASSERT(retval == ENOSPC);
+			goto error;
+		}
+		retval = xfs_da_split(state);
+	}
+error:					/* common exit, also reached by fall-through */
+	xfs_da_state_free(state);
+
+	return(retval);
+}
+
+/*
+ * Remove a name from a B-tree directory.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * Returns the lookup result unchanged when it is anything other than
+ * EEXIST; otherwise returns the result of the join step, which is 0
+ * when no join was attempted.
+ */
+STATIC int
+xfs_dir_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int retval, error;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		retval = error;
+	if (retval != EEXIST) {
+		xfs_da_state_free(state);
+		return(retval);
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
+	xfs_da_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 * A nonzero result from the leaf remove above is what triggers
+	 * the join.
+	 */
+	error = 0;
+	if (retval) {
+		error = xfs_da_join(state);
+	}
+
+	xfs_da_state_free(state);
+	if (error)
+		return(error);
+	return(0);
+}
+
+/*
+ * Look up a filename in a Btree-format (node) directory.
+ * Use an internal routine to actually do all the work.
+ *
+ * Returns the lookup routine's own return value when that is nonzero,
+ * otherwise the search outcome it stored through its second argument.
+ */
+STATIC int
+xfs_dir_node_lookup(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	int retval, error, i;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists,
+	 * and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	}
+
+	/*
+	 * If not in a transaction, we have to release all the buffers.
+	 * (The loop itself is unconditional: every block on the lookup
+	 * path is released against args->trans and its pointer cleared.)
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+/*
+ * Copy out directory entries for getdents(), for Btree-format (node)
+ * directories.
+ *
+ * The starting position is decoded from uio->uio_offset, a cookie that
+ * carries a block number and a hash value.  On return *eofp is 1 when
+ * the chain of leaf blocks was exhausted (or the hash lies beyond the
+ * last Btree entry) and 0 when the copy-out routine raised its
+ * end-of-buffer flag.  *eofp is not written on the error returns taken
+ * before a leaf has been processed.
+ */
+STATIC int
+xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	xfs_dir_leafblock_t *leaf = NULL;
+	xfs_dablk_t bno, nextbno;
+	xfs_dahash_t cookhash;
+	xfs_mount_t *mp;
+	int error, eob, i;
+	xfs_dabuf_t *bp;
+	xfs_daddr_t nextda;
+
+	/*
+	 * Pick up our context.
+	 */
+	mp = dp->i_mount;
+	bp = NULL;
+	bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
+	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
+
+	xfs_dir_trace_g_du("node: start", dp, uio);
+
+	/*
+	 * Re-find our place, even if we're confused about what our place is.
+	 *
+	 * First we check the block number from the magic cookie, it is a
+	 * cache of where we ended last time.  If we find a leaf block, and
+	 * the cookie's hashval lies between the first and last hashvals in
+	 * that block, then we run with it.  Otherwise the buffer is released
+	 * and bp is left NULL so that the Btree walk below takes over.
+	 *
+	 * An EFSCORRUPTED read error is tolerated here; any other read
+	 * error is returned.
+	 *
+	 * NOTE(review): the last test indexes entries[hdr.count - 1]; with
+	 * a leaf whose count is 0 that index is out of range - confirm such
+	 * a leaf can never be reached through a cookie.
+	 */
+	if (bno > 0) {
+		error = xfs_da_read_buf(trans, dp, bno, -1, &bp, XFS_DATA_FORK);
+		if ((error != 0) && (error != EFSCORRUPTED))
+			return(error);
+		if (bp)
+			leaf = bp->data;
+		if (bp && INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+			xfs_dir_trace_g_dub("node: block not a leaf",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+		if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
+			xfs_dir_trace_g_dub("node: leaf hash too large",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+		if (bp &&
+		    cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
+			xfs_dir_trace_g_dub("node: leaf hash too small",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+	}
+
+	/*
+	 * If we did not find a leaf block from the blockno in the cookie,
+	 * or there was no blockno in the cookie (eg: first time thru),
+	 * then we start at the top of the Btree and re-find our hashval.
+	 *
+	 * Each pass reads one block.  A block that is not a Btree node
+	 * ends the walk with its buffer still held; the leaf loop further
+	 * down checks its magic number.  In a node, the first entry whose
+	 * hashval is >= the cookie's hashval selects the next block; if no
+	 * entry qualifies, the cookie is reset and EOF is reported.
+	 */
+	if (bp == NULL) {
+		xfs_dir_trace_g_du("node: start at root" , dp, uio);
+		bno = 0;
+		for (;;) {
+			error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
+						       XFS_DATA_FORK);
+			if (error)
+				return(error);
+			if (bp == NULL)
+				return(XFS_ERROR(EFSCORRUPTED));
+			node = bp->data;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)
+				break;
+			btree = &node->btree[0];
+			xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
+			for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); btree++, i++) {
+				if (INT_GET(btree->hashval, ARCH_CONVERT) >= cookhash) {
+					bno = INT_GET(btree->before, ARCH_CONVERT);
+					break;
+				}
+			}
+			if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
+				xfs_da_brelse(trans, bp);
+				xfs_dir_trace_g_du("node: hash beyond EOF",
+							  dp, uio);
+				uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
+							     XFS_DA_MAXHASH);
+				*eofp = 1;
+				return(0);
+			}
+			xfs_dir_trace_g_dub("node: going to block",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+		}
+	}
+	ASSERT(cookhash != XFS_DA_MAXHASH);
+
+	/*
+	 * We've dropped down to the (first) leaf block that contains the
+	 * hashval we are interested in.  Continue rolling upward thru the
+	 * leaf blocks until we fill up our buffer.
+	 *
+	 * Before a leaf is copied out, readahead is started on its forward
+	 * sibling (if any) and the value returned is passed both to the
+	 * copy-out routine and to the read of that sibling.  The current
+	 * buffer is released before the next one is read.
+	 */
+	for (;;) {
+		leaf = bp->data;
+		if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+			xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
+			xfs_da_brelse(trans, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
+		if ((nextbno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))) {
+			nextda = xfs_da_reada_buf(trans, dp, nextbno,
+						  XFS_DATA_FORK);
+		} else
+			nextda = -1;
+		error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
+						  put, nextda);
+		xfs_da_brelse(trans, bp);
+		bno = nextbno;
+		if (eob) {
+			xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
+			*eofp = 0;
+			return(error);
+		}
+		if (bno == 0)
+			break;
+		error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
+					XFS_DATA_FORK);
+		if (error)
+			return(error);
+		if (bp == NULL)
+			return(XFS_ERROR(EFSCORRUPTED));
+	}
+	*eofp = 1;
+	xfs_dir_trace_g_du("node: E-O-F", dp, uio);
+	return(0);
+}
+
+/*
+ * Look up a filename in a Btree-format (node) directory, replace the
+ * inode number.
+ * Use an internal routine to actually do the lookup.
+ *
+ * Returns 0 after a successful replacement, otherwise the lookup result.
+ */
+STATIC int
+xfs_dir_node_replace(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	xfs_ino_t inum;
+	int retval, error, i;
+	xfs_dabuf_t *bp;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+	inum = args->inumber;	/* saved before the lookup runs; presumably the lookup may overwrite args->inumber - TODO confirm */
+
+	/*
+	 * Search to see if name exists,
+	 * and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	}
+
+	if (retval == EEXIST) {
+		blk = &state->path.blk[state->path.active - 1];
+		ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+		bp = blk->bp;
+		leaf = bp->data;
+		entry = &leaf->entries[blk->index];
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		/* XXX - replace assert ? */
+		XFS_DIR_SF_PUT_DIRINO_ARCH(&inum, &namest->inumber, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));	/* log only the rewritten inode number */
+		xfs_da_buf_done(bp);
+		blk->bp = NULL;
+		retval = 0;
+	} else {
+		i = state->path.active - 1;	/* last path block: released unmodified */
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	for (i = 0; i < state->path.active - 1; i++) {	/* every path block except the last */
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+
+#if defined(XFS_DIR_TRACE)
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ * Records dp, its mount, the two 32-bit halves of uio_offset and
+ * uio_resid; the remaining trace slots are passed as NULL.
+ */
+void
+xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode, a uio and a directory block
+ * number.
+ */
+void
+xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)bno,
+		     NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */ +void +xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio, + xfs_da_intnode_t *node) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where, + (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount, + (__psunsigned_t)(uio->uio_offset >> 32), + (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF), + (__psunsigned_t)uio->uio_resid, + (__psunsigned_t)INT_GET(node->hdr.info.forw, ARCH_CONVERT), + (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT), + (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT), + NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio, + xfs_dir_leafblock_t *leaf) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where, + (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount, + (__psunsigned_t)(uio->uio_offset >> 32), + (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF), + (__psunsigned_t)uio->uio_resid, + (__psunsigned_t)INT_GET(leaf->hdr.info.forw, ARCH_CONVERT), + (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT), + (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT), + NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio, + xfs_dir_leaf_entry_t *entry) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where, + (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount, + (__psunsigned_t)(uio->uio_offset >> 32), + (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF), + (__psunsigned_t)uio->uio_resid, + (__psunsigned_t)INT_GET(entry->hashval, ARCH_CONVERT), + NULL, NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. 
+ */ +void +xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where, + (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount, + (__psunsigned_t)(uio->uio_offset >> 32), + (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF), + (__psunsigned_t)uio->uio_resid, + (__psunsigned_t)(cookie >> 32), + (__psunsigned_t)(cookie & 0xFFFFFFFF), + NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for the arguments given to the routine, + * generic form. + */ +void +xfs_dir_trace_enter(int type, char *where, + __psunsigned_t a0, __psunsigned_t a1, + __psunsigned_t a2, __psunsigned_t a3, + __psunsigned_t a4, __psunsigned_t a5, + __psunsigned_t a6, __psunsigned_t a7, + __psunsigned_t a8, __psunsigned_t a9, + __psunsigned_t a10, __psunsigned_t a11) +{ + ASSERT(xfs_dir_trace_buf); + ktrace_enter(xfs_dir_trace_buf, (void *)((__psunsigned_t)type), + (void *)where, + (void *)a0, (void *)a1, (void *)a2, + (void *)a3, (void *)a4, (void *)a5, + (void *)a6, (void *)a7, (void *)a8, + (void *)a9, (void *)a10, (void *)a11, + NULL, NULL); +} +#endif /* XFS_DIR_TRACE */ diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dir.h linux-2.4-xfs/fs/xfs/xfs_dir.h --- linux-2.4.19/fs/xfs/xfs_dir.h Thu Jan 1 01:00:00 1970 +++ linux-2.4-xfs/fs/xfs/xfs_dir.h Wed Jul 10 23:14:00 2002 @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. 
Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR_H__
+#define __XFS_DIR_H__
+
+/*
+ * Large directories are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Filenames are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of a filename may not be unique, we may have duplicate keys.  The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Small directories use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+#ifdef XFS_ALL_TRACE
+#define XFS_DIR_TRACE		/* directory tracing follows the global trace switch */
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_DIR_TRACE		/* directory tracing is never enabled without DEBUG */
+#endif
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+struct uio;			/* forward declarations: this header only */
+struct xfs_bmap_free;		/* names these structures through pointers */
+struct xfs_da_args;
+struct xfs_dinode;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Directory function types.
+ * Put in structures (xfs_dirops_t) for v1 and v2 directories.
+ */
+typedef void	(*xfs_dir_mount_t)(struct xfs_mount *mp);
+typedef int	(*xfs_dir_isempty_t)(struct xfs_inode *dp);
+typedef int	(*xfs_dir_init_t)(struct xfs_trans *tp,
+				  struct xfs_inode *dp,
+				  struct xfs_inode *pdp);
+typedef int	(*xfs_dir_createname_t)(struct xfs_trans *tp,
+					struct xfs_inode *dp,
+					char *name,
+					int namelen,
+					xfs_ino_t inum,
+					xfs_fsblock_t *first,
+					struct xfs_bmap_free *flist,
+					xfs_extlen_t total);
+typedef int	(*xfs_dir_lookup_t)(struct xfs_trans *tp,
+				    struct xfs_inode *dp,
+				    char *name,
+				    int namelen,
+				    xfs_ino_t *inum);
+typedef int	(*xfs_dir_removename_t)(struct xfs_trans *tp,
+					struct xfs_inode *dp,
+					char *name,
+					int namelen,
+					xfs_ino_t ino,
+					xfs_fsblock_t *first,
+					struct xfs_bmap_free *flist,
+					xfs_extlen_t total);
+typedef int	(*xfs_dir_getdents_t)(struct xfs_trans *tp,
+				      struct xfs_inode *dp,
+				      struct uio *uio,
+				      int *eofp);
+typedef int	(*xfs_dir_replace_t)(struct xfs_trans *tp,
+				     struct xfs_inode *dp,
+				     char *name,
+				     int namelen,
+				     xfs_ino_t inum,
+				     xfs_fsblock_t *first,
+				     struct xfs_bmap_free *flist,
+				     xfs_extlen_t total);
+typedef int	(*xfs_dir_canenter_t)(struct xfs_trans *tp,
+				      struct xfs_inode *dp,
+				      char *name,
+				      int namelen);
+typedef int	(*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
+						       struct xfs_dinode *dip);
+typedef int	(*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
+
+typedef struct xfs_dirops {
+	xfs_dir_mount_t				xd_mount;	/* set up directory fields in the mount structure */
+	xfs_dir_isempty_t			xd_isempty;	/* does the directory hold only "." and ".."? */
+	xfs_dir_init_t				xd_init;	/* create the "." and ".." entries */
+	xfs_dir_createname_t			xd_createname;	/* enter a name */
+	xfs_dir_lookup_t			xd_lookup;	/* look a name up, give back its inode number */
+	xfs_dir_removename_t			xd_removename;	/* remove an entry */
+	xfs_dir_getdents_t			xd_getdents;	/* read the directory */
+	xfs_dir_replace_t			xd_replace;	/* replace the inode number of an entry */
+	xfs_dir_canenter_t			xd_canenter;	/* can a name be added without allocating space? */
+	xfs_dir_shortform_validate_ondisk_t	xd_shortform_validate_ondisk;	/* check an on-disk shortform inode */
+	xfs_dir_shortform_to_single_t		xd_shortform_to_single;	/* presumably converts shortform to single-block form - TODO confirm */
+} xfs_dirops_t;
+
+/*
+ * Overall external interface routines.
+ */
+void	xfs_dir_startup(void);	/* called exactly once */
+
+#define	XFS_DIR_MOUNT(mp)	\
+	((mp)->m_dirops.xd_mount(mp))	/* this and the XFS_DIR_* macros below call through the mount's m_dirops vector */
+#define	XFS_DIR_ISEMPTY(mp,dp)	\
+	((mp)->m_dirops.xd_isempty(dp))
+#define	XFS_DIR_INIT(mp,tp,dp,pdp)	\
+	((mp)->m_dirops.xd_init(tp,dp,pdp))
+#define	XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
+	((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
+				      total))
+#define	XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum)	\
+	((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
+#define	XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \
+	((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
+#define	XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp)	\
+	((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
+#define	XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total) \
+	((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
+#define	XFS_DIR_CANENTER(mp,tp,dp,name,namelen)	\
+	((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
+#define	XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip)	\
+	((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
+#define	XFS_DIR_SHORTFORM_TO_SINGLE(mp,args)	\
+	((mp)->m_dirops.xd_shortform_to_single(args))
+
+#define	XFS_DIR_IS_V1(mp)	((mp)->m_dirversion == 1)	/* true when the mount's directory version is 1 */
+extern xfs_dirops_t	xfsv1_dirops;	/* operations vector for v1 directories; defined outside this header */
+
+#endif	/* __XFS_DIR_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dir2.c linux-2.4-xfs/fs/xfs/xfs_dir2.c
--- linux-2.4.19/fs/xfs/xfs_dir2.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_dir2.c Thu Jul 18 21:52:58 2002
@@ -0,0 +1,830 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * XFS v2 directory implementation. + * Top-level and utility routines. + */ + +#include <xfs.h> + +/* + * Declarations for interface routines.
+ */
+static void	xfs_dir2_mount(xfs_mount_t *mp);
+static int	xfs_dir2_isempty(xfs_inode_t *dp);
+static int	xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
+			      xfs_inode_t *pdp);
+static int	xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
+				    char *name, int namelen, xfs_ino_t inum,
+				    xfs_fsblock_t *first,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total);
+static int	xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				int namelen, xfs_ino_t *inum);
+static int	xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
+				    char *name, int namelen, xfs_ino_t ino,
+				    xfs_fsblock_t *first,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total);
+static int	xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp);
+static int	xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				 int namelen, xfs_ino_t inum,
+				 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
+				 xfs_extlen_t total);
+static int	xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				  int namelen);
+static int	xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
+						   xfs_dinode_t *dip);
+
+/*
+ * Utility routine declarations.
+ * These are the two entry formatting routines that xfs_dir2_getdents
+ * chooses between.
+ */
+static int	xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
+static int	xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
+
+/*
+ * Directory operations vector.
+ * One entry per member of xfs_dirops_t, each bound to the version 2
+ * routine declared above; xd_shortform_to_single is bound to
+ * xfs_dir2_sf_to_block, which is not defined in this part of the file.
+ */
+xfs_dirops_t	xfsv2_dirops = {
+	.xd_mount			= xfs_dir2_mount,
+	.xd_isempty			= xfs_dir2_isempty,
+	.xd_init			= xfs_dir2_init,
+	.xd_createname			= xfs_dir2_createname,
+	.xd_lookup			= xfs_dir2_lookup,
+	.xd_removename			= xfs_dir2_removename,
+	.xd_getdents			= xfs_dir2_getdents,
+	.xd_replace			= xfs_dir2_replace,
+	.xd_canenter			= xfs_dir2_canenter,
+	.xd_shortform_validate_ondisk	= xfs_dir2_shortform_validate_ondisk,
+	.xd_shortform_to_single		= xfs_dir2_sf_to_block,
+};
+
+/*
+ * Interface routines.
+ */
+
+/*
+ * Initialize directory-related fields in the mount structure.
+ */ +static void +xfs_dir2_mount( + xfs_mount_t *mp) /* filesystem mount point */ +{ + mp->m_dirversion = 2; + ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= + XFS_MAX_BLOCKSIZE); + mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); + mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; + mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp)); + mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); + mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp)); + mp->m_da_node_ents = + (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; +} + +/* + * Return 1 if directory contains only "." and "..". + */ +static int /* return code */ +xfs_dir2_isempty( + xfs_inode_t *dp) /* incore inode structure */ +{ + xfs_dir2_sf_t *sfp; /* shortform directory structure */ + + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + /* + * Might happen during shutdown. + */ + if (dp->i_d.di_size == 0) { + return 1; + } + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return 0; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + return INT_ISZERO(sfp->hdr.count, ARCH_CONVERT); +} + +/* + * Initialize a directory with its "." and ".." entries. + */ +static int /* error */ +xfs_dir2_init( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + xfs_inode_t *pdp) /* incore parent directory inode */ +{ + xfs_da_args_t args; /* operation arguments */ + int error; /* error return value */ + + bzero((char *)&args, sizeof(args)); + args.dp = dp; + args.trans = tp; + ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) { + return error; + } + return xfs_dir2_sf_create(&args, pdp->i_ino); +} + +/* + Enter a name in a directory. 
+ */
+static int				/* error */
+xfs_dir2_createname(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* new entry name */
+	int		namelen,	/* new entry name length */
+	xfs_ino_t	inum,		/* new entry inode number */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	xfs_da_args_t	args;		/* request handed to the work routine */
+	int		error;		/* error from a helper */
+	int		is_fmt;		/* answer from the format probes */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	error = xfs_dir_ino_validate(tp->t_mountp, inum);
+	if (error)
+		return error;
+	XFS_STATS_INC(xfsstats.xs_dir_create);
+
+	/*
+	 * Describe the request: what to add ...
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	/*
+	 * ... where, and with which allocation context ...
+	 */
+	args.dp = dp;
+	args.trans = tp;
+	args.whichfork = XFS_DATA_FORK;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	/*
+	 * ... and how: a real add, not just a check.
+	 */
+	args.justcheck = 0;
+	args.addname = 1;
+	args.oknoent = 1;
+
+	/*
+	 * Dispatch on the directory's current form: shortform (inode
+	 * format is local), single block, leaf, or node.  A failing
+	 * format probe is returned directly.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return xfs_dir2_sf_addname(&args);
+
+	error = xfs_dir2_isblock(tp, dp, &is_fmt);
+	if (error)
+		return error;
+	if (is_fmt)
+		return xfs_dir2_block_addname(&args);
+
+	error = xfs_dir2_isleaf(tp, dp, &is_fmt);
+	if (error)
+		return error;
+	if (is_fmt)
+		return xfs_dir2_leaf_addname(&args);
+
+	return xfs_dir2_node_addname(&args);
+}
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ */
+static int				/* error */
+xfs_dir2_lookup(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* lookup name */
+	int		namelen,	/* lookup name length */
+	xfs_ino_t	*inum)		/* out: inode number */
+{
+	xfs_da_args_t	args;		/* request handed to the work routine */
+	int		error;		/* error from a helper */
+	int		is_fmt;		/* answer from the format probes */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(EINVAL);
+	XFS_STATS_INC(xfsstats.xs_dir_lookup);
+
+	/*
+	 * Describe the request.  A lookup needs no allocation context.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.trans = tp;
+	args.whichfork = XFS_DATA_FORK;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.justcheck = 0;
+	args.addname = 0;
+	args.oknoent = 1;
+
+	/*
+	 * Dispatch on the directory's current form.  A failing format
+	 * probe is returned directly, without touching *inum.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_dir2_sf_lookup(&args);
+	} else {
+		error = xfs_dir2_isblock(tp, dp, &is_fmt);
+		if (error)
+			return error;
+		if (is_fmt) {
+			error = xfs_dir2_block_lookup(&args);
+		} else {
+			error = xfs_dir2_isleaf(tp, dp, &is_fmt);
+			if (error)
+				return error;
+			if (is_fmt)
+				error = xfs_dir2_leaf_lookup(&args);
+			else
+				error = xfs_dir2_node_lookup(&args);
+		}
+	}
+
+	/*
+	 * The work routines report a hit as EEXIST; callers of this
+	 * routine see 0 instead, together with the inode number.
+	 */
+	if (error != 0 && error != EEXIST)
+		return error;
+	*inum = args.inumber;
+	return 0;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+static int				/* error */
+xfs_dir2_removename(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to remove */
+	int		namelen,	/* name length of entry to remove */
+	xfs_ino_t	ino,		/* inode number of entry to remove */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	xfs_da_args_t	args;		/* request handed to the work routine */
+	int		error;		/* error from a helper */
+	int		is_fmt;		/* answer from the format probes */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_remove);
+
+	/*
+	 * Describe the request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = ino;
+	args.dp = dp;
+	args.trans = tp;
+	args.whichfork = XFS_DATA_FORK;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.justcheck = 0;
+	args.addname = 0;
+	args.oknoent = 0;
+
+	/*
+	 * Dispatch on the directory's current form: shortform (inode
+	 * format is local), single block, leaf, or node.  A failing
+	 * format probe is returned directly.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return xfs_dir2_sf_removename(&args);
+
+	error = xfs_dir2_isblock(tp, dp, &is_fmt);
+	if (error)
+		return error;
+	if (is_fmt)
+		return xfs_dir2_block_removename(&args);
+
+	error = xfs_dir2_isleaf(tp, dp, &is_fmt);
+	if (error)
+		return error;
+	if (is_fmt)
+		return xfs_dir2_leaf_removename(&args);
+
+	return xfs_dir2_node_removename(&args);
+}
+
+/*
+ * Read a directory.
+ */
+static int				/* error */
+xfs_dir2_getdents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	uio_t		*uio,		/* caller's buffer control */
+	int		*eofp)		/* out: eof reached */
+{
+	int		mask;		/* low bits that must be clear in an aligned value */
+	xfs_dirent_t	*dbp;		/* bounce buffer, NULL when not needed */
+	xfs_dir2_put_t	put;		/* entry formatting routine */
+	int		error;		/* error from a helper */
+	int		is_block;	/* answer from the block-form probe */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_getdents);
+
+	/*
+	 * Entries are formatted straight into the caller's buffer when it
+	 * is a single segment whose base and length are both multiples of
+	 * sizeof(xfs_off_t).  In every other case they are staged through
+	 * a temporary dirent buffer with room for the longest name.
+	 */
+	mask = sizeof(xfs_off_t) - 1;
+	if ((uio->uio_iovcnt != 1) ||
+	    (((__psint_t)uio->uio_iov[0].iov_base & mask) != 0) ||
+	    ((uio->uio_iov[0].iov_len & mask) != 0)) {
+		dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
+		put = xfs_dir2_put_dirent64_uio;
+	} else {
+		dbp = NULL;
+		put = xfs_dir2_put_dirent64_direct;
+	}
+
+	*eofp = 0;
+
+	/*
+	 * Dispatch on the directory's current form.  Everything that is
+	 * neither shortform nor single block goes to the leaf routine.
+	 * A failing probe skips the read but still falls through to the
+	 * cleanup below.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
+	} else {
+		error = xfs_dir2_isblock(tp, dp, &is_block);
+		if (!error) {
+			if (is_block)
+				error = xfs_dir2_block_getdents(tp, dp, uio,
+								eofp, dbp, put);
+			else
+				error = xfs_dir2_leaf_getdents(tp, dp, uio,
+							       eofp, dbp, put);
+		}
+	}
+
+	if (dbp != NULL)
+		kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
+	return error;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+static int				/* error */
+xfs_dir2_replace(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to replace */
+	int		namelen,	/* name length of entry to replace */
+	xfs_ino_t	inum,		/* new inode number */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	xfs_da_args_t	args;		/* request handed to the work routine */
+	int		error;		/* error from a helper */
+	int		is_fmt;		/* answer from the format probes */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(EINVAL);
+	error = xfs_dir_ino_validate(tp->t_mountp, inum);
+	if (error)
+		return error;
+
+	/*
+	 * Describe the request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.trans = tp;
+	args.whichfork = XFS_DATA_FORK;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.justcheck = 0;
+	args.addname = 0;
+	args.oknoent = 0;
+
+	/*
+	 * Dispatch on the directory's current form: shortform (inode
+	 * format is local), single block, leaf, or node.  A failing
+	 * format probe is returned directly.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return xfs_dir2_sf_replace(&args);
+
+	error = xfs_dir2_isblock(tp, dp, &is_fmt);
+	if (error)
+		return error;
+	if (is_fmt)
+		return xfs_dir2_block_replace(&args);
+
+	error = xfs_dir2_isleaf(tp, dp, &is_fmt);
+	if (error)
+		return error;
+	if (is_fmt)
+		return xfs_dir2_leaf_replace(&args);
+
+	return xfs_dir2_node_replace(&args);
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ */
+static int				/* error */
+xfs_dir2_canenter(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to add */
+	int		namelen)	/* name length of entry to add */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* out value of xfs_dir2_isblock/isleaf */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	/*
+	 * Fill in the arg structure for a dry-run add: no inode, no bmap state.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;			/* no space reservation */
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = args.oknoent = 1;	/* check only */
+	/*
+	 * Dispatch on the directory's format: shortform, block, leaf, else node.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_addname(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;		/* could not determine the format */
+	} else if (v)
+		rval = xfs_dir2_block_addname(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;		/* could not determine the format */
+	} else if (v)
+		rval = xfs_dir2_leaf_addname(&args);
+	else
+		rval = xfs_dir2_node_addname(&args);
+	return rval;
+}
+
+/*
+ * Dummy routine for shortform inode validation.
+ * Can't really do this, so every inode is reported as valid (0).
+ */
+/* ARGSUSED */
+static int				/* error */
+xfs_dir2_shortform_validate_ondisk(
+	xfs_mount_t	*mp,		/* filesystem mount point (unused) */
+	xfs_dinode_t	*dip)		/* ondisk inode (unused) */
+{
+	return 0;
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ * This routine is for data and free blocks, not leaf/node blocks
+ * which are handled by xfs_da_grow_inode.
+ */ +int /* error */ +xfs_dir2_grow_inode( + xfs_da_args_t *args, /* operation arguments */ + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ +{ + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int got; /* blocks actually mapped */ + int i; /* temp mapping index */ + xfs_bmbt_irec_t map; /* single structure for bmap */ + int mapi; /* mapping index */ + xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */ + xfs_mount_t *mp; /* filesystem mount point */ + int nmap; /* number of bmap entries */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_s("grow_inode", args, space); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Set lowest possible block in the space requested. + */ + bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); + count = mp->m_dirblkfsbs; + /* + * Find the first hole for our block. + */ + if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) { + return error; + } + nmap = 1; + ASSERT(args->firstblock != NULL); + /* + * Try mapping the new block contiguously (one extent). + */ + if ((error = xfs_bmapi(tp, dp, bno, count, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist))) { + return error; + } + ASSERT(nmap <= 1); + /* + * Got it in 1. + */ + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } + /* + * Didn't work and this is a multiple-fsb directory block. + * Try again with contiguous flag turned on. + */ + else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; /* current file offset */ + + /* + * Space for maximum number of mappings. + */ + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + /* + * Iterate until we get to the end of our block. 
+ */ + for (b = bno, mapi = 0; b < bno + count; ) { + int c; /* current fsb count */ + + /* + * Can't map more than MAX_NMAP at once. + */ + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(bno + count - b); + if ((error = xfs_bmapi(tp, dp, b, c, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist))) { + kmem_free(mapp, sizeof(*mapp) * count); + return error; + } + if (nmap < 1) + break; + /* + * Add this bunch into our table, go to the next offset. + */ + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } + /* + * Didn't work. + */ + else { + mapi = 0; + mapp = NULL; + } + /* + * See how many fsb's we got. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + /* + * Didn't get enough fsb's, or the first/last block's are wrong. + */ + if (got != count || mapp[0].br_startoff != bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + bno + count) { + if (mapp != &map) + kmem_free(mapp, sizeof(*mapp) * count); + return XFS_ERROR(ENOSPC); + } + /* + * Done with the temporary mapping table. + */ + if (mapp != &map) + kmem_free(mapp, sizeof(*mapp) * count); + *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno); + /* + * Update file's size if this is the data space and it grew. + */ + if (space == XFS_DIR2_DATA_SPACE) { + xfs_fsize_t size; /* directory file (data) size */ + + size = XFS_FSB_TO_B(mp, bno + count); + if (size > dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + } + return 0; +} + +/* + * See if the directory is a single-block form directory. 
+ */
+int					/* error */
+xfs_dir2_isblock(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	int		*vp)		/* out: 1 is block, 0 is not block */
+{
+	xfs_fileoff_t	last;		/* last file offset */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		rval;		/* error, then reused as the result */
+
+	mp = dp->i_mount;
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+		return rval;		/* *vp is left untouched on error */
+	}
+	rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;	/* data fork ends at exactly one dir block */
+	ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+	*vp = rval;
+	return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int					/* error */
+xfs_dir2_isleaf(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	int		*vp)		/* out: 1 is leaf, 0 is not leaf */
+{
+	xfs_fileoff_t	last;		/* last file offset */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		rval;		/* return value */
+
+	mp = dp->i_mount;
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+		return rval;		/* *vp is left untouched on error */
+	}
+	*vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);	/* fork ends (1 << sb_dirblklog) fsbs past m_dirleafblk */
+	return 0;
+}
+
+/*
+ * Getdents put routine for 64-bit ABI, direct form.
+ */
+static int					/* error */
+xfs_dir2_put_dirent64_direct(
+	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+{
+	xfs_dirent_t		*idbp;		/* dirent pointer */
+	iovec_t			*iovp;		/* io vector */
+	int			namelen;	/* entry name length */
+	int			reclen;		/* entry total length */
+	uio_t			*uio;		/* I/O control */
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	/*
+	 * Won't fit in the remaining space: report "not stored", not an error.
+	 */
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	iovp = uio->uio_iov;
+	idbp = (xfs_dirent_t *)iovp->iov_base;	/* build the entry in place */
+	iovp->iov_base = (char *)idbp + reclen;	/* consume reclen bytes of the iovec */
+	iovp->iov_len -= reclen;
+	uio->uio_resid -= reclen;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook;
+	idbp->d_name[namelen] = '\0';		/* terminator first; name bytes copied below */
+	pa->done = 1;
+	bcopy(pa->name, idbp->d_name, namelen);
+	return 0;
+}
+
+/*
+ * Getdents put routine for 64-bit ABI, uio form.
+ */
+static int					/* error */
+xfs_dir2_put_dirent64_uio(
+	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+{
+	xfs_dirent_t		*idbp;		/* dirent pointer */
+	int			namelen;	/* entry name length */
+	int			reclen;		/* entry total length */
+	int			rval;		/* return value */
+	uio_t			*uio;		/* I/O control */
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	/*
+	 * Won't fit in the remaining space: report "not stored", not an error.
+	 */
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	idbp = pa->dbp;				/* staging dirent supplied by the caller */
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook;
+	idbp->d_name[namelen] = '\0';
+	bcopy(pa->name, idbp->d_name, namelen);
+	rval = uiomove((caddr_t)idbp, reclen, UIO_READ, uio);	/* hand the staged entry to the uio */
+	pa->done = (rval == 0);			/* stored only if uiomove succeeded */
+	return rval;
+}
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_dir2_db_t	db,		/* directory block number */
+	xfs_dabuf_t	*bp)		/* block's buffer */
+{
+	xfs_fileoff_t	bno;		/* offset found by xfs_bmap_last_before */
+	xfs_dablk_t	da;		/* db converted via XFS_DIR2_DB_TO_DA */
+	int		done;		/* bunmap is finished */
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		error;		/* error return value */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	da = XFS_DIR2_DB_TO_DA(mp, db);
+	/*
+	 * Unmap the fsblock(s).
+	 */
+	if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+			&done))) {
+		/*
+		 * ENOSPC actually can happen if we're in a removename with
+		 * no space reservation, and the resulting block removal
+		 * would cause a bmap btree split or conversion from extents
+		 * to btree. This can only happen for un-fragmented
+		 * directory blocks, since you need to be punching out
+		 * the middle of an extent.
+		 * In this case we need to leave the block in the file,
+		 * and not binval it.
+		 * So the block has to be in a consistent empty state
+		 * and appropriately logged.
+		 * We don't free up the buffer, the caller can tell it
+		 * hasn't happened since it got an error back.
+		 */
+		return error;
+	}
+	ASSERT(done);
+	/*
+	 * Invalidate the buffer from the transaction.
+	 */
+	xfs_da_binval(tp, bp);
+	/*
+	 * If it's not a data block, we're done.
+	 */
+	if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+		return 0;
+	/*
+	 * If the block isn't the last one in the directory, we're done.
+	 */
+	if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0))
+		return 0;
+	bno = da;			/* in/out argument: search starts from the removed block */
+	if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+		/*
+		 * This can't really happen unless there's kernel corruption.
+		 */
+		return error;
+	}
+	if (db == mp->m_dirdatablk)
+		ASSERT(bno == 0);
+	else
+		ASSERT(bno > 0);
+	/*
+	 * Set the size to the new last block.
+	 */
+	dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	return 0;
+}
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dir2.h linux-2.4-xfs/fs/xfs/xfs_dir2.h
--- linux-2.4.19/fs/xfs/xfs_dir2.h Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_dir2.h Wed Jul 10 23:14:00 2002
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_H__ +#define __XFS_DIR2_H__ + +struct dirent; +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_put_args; +struct xfs_inode; +struct xfs_trans; + +/* + * Directory version 2. + * There are 4 possible formats: + * shortform + * single block - data with embedded leaf at the end + * multiple data blocks, single leaf+freeindex block + * data blocks, node&leaf blocks (btree), freeindex blocks + * + * The shortform format is in xfs_dir2_sf.h. + * The single block format is in xfs_dir2_block.h. + * The data block format is in xfs_dir2_data.h. + * The leaf and freeindex block formats are in xfs_dir2_leaf.h. + * Node blocks are the same as the other version, in xfs_da_btree.h. + */ + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +/* + * For getdents, argument struct for put routines. 
+ */
+typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa);
+typedef struct xfs_dir2_put_args {
+	xfs_off_t		cook;		/* offset cookie, stored in d_off */
+	xfs_intino_t		ino;		/* inode number */
+	struct xfs_dirent	*dbp;		/* staging dirent for the uio put routine */
+	char			*name;		/* directory entry name */
+	int			namelen;	/* length of name */
+	int			done;		/* output: set if value was stored */
+	xfs_dir2_put_t		put;		/* put function ptr (i/o) */
+	struct uio		*uio;		/* uio control structure */
+} xfs_dir2_put_args_t;
+
+#define	XFS_DIR_IS_V2(mp)	((mp)->m_dirversion == 2)
+extern xfs_dirops_t	xfsv2_dirops;
+
+/*
+ * Other interfaces used by the rest of the dir v2 code.
+ */
+extern int
+	xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+			    xfs_dir2_db_t *dbp);
+
+extern int
+	xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+
+extern int
+	xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+
+extern int
+	xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+			      struct xfs_dabuf *bp);
+
+#endif	/* __XFS_DIR2_H__ */
diff -uNr -Xdontdiff -p linux-2.4.19/fs/xfs/xfs_dir2_block.c linux-2.4-xfs/fs/xfs/xfs_dir2_block.c
--- linux-2.4.19/fs/xfs/xfs_dir2_block.c Thu Jan 1 01:00:00 1970
+++ linux-2.4-xfs/fs/xfs/xfs_dir2_block.c Wed Jul 10 23:14:00 2002
@@ -0,0 +1,1231 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.
Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir2_block.c + * XFS V2 directory implementation, single-block form. + * See xfs_dir2_block.h for the format. + */ + +#include <xfs.h> + + +/* + * Local function prototypes. + */ +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, + int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, + int *entno); +static int xfs_dir2_block_sort(const void *a, const void *b); + +/* + * Add an entry to a block directory.
+ */
+int						/* error */
+xfs_dir2_block_addname(
+	xfs_da_args_t		*args)		/* directory op arguments */
+{
+	xfs_dir2_data_free_t	*bf;		/* bestfree table in block */
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	int			compact;	/* need to compact leaf ents */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* free space chosen for the new entry, NULL if none */
+	int			error;		/* error return value */
+	xfs_dir2_data_unused_t	*enddup=NULL;	/* unused at end of data */
+	xfs_dahash_t		hash;		/* hash value of found entry */
+	int			high;		/* high index for binary srch */
+	int			highstale;	/* high stale index */
+	int			lfloghigh=0;	/* last final leaf to log */
+	int			lfloglow=0;	/* first final leaf to log */
+	int			len;		/* length of the new entry */
+	int			low;		/* low index for binary srch */
+	int			lowstale;	/* low stale index */
+	int			mid=0;		/* midpoint for binary srch */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log header */
+	int			needscan;	/* need to rescan freespace */
+	xfs_dir2_data_off_t	*tagp;		/* pointer to tag value */
+	xfs_trans_t		*tp;		/* transaction structure */
+
+	xfs_dir2_trace_args("block_addname", args);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the (one and only) directory block into dabuf bp.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	block = bp->data;
+	/*
+	 * Check the magic number, corrupted if wrong.
+	 */
+	if (INT_GET(block->hdr.magic, ARCH_CONVERT) != XFS_DIR2_BLOCK_MAGIC) {
+		xfs_da_brelse(tp, bp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	len = XFS_DIR2_DATA_ENTSIZE(args->namelen);	/* bytes needed by the new data entry */
+	/*
+	 * Set up pointers to parts of the block.
+	 */
+	bf = block->hdr.bestfree;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * No stale entries?  Need space for entry and new leaf.
+	 */
+	if (INT_ISZERO(btp->stale, ARCH_CONVERT)) {
+		/*
+		 * Tag just before the first leaf entry.
+		 */
+		tagp = (xfs_dir2_data_off_t *)blp - 1;
+		/*
+		 * Data object just before the first leaf entry.
+		 */
+		enddup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+		/*
+		 * If it's not free then can't do this add without cleaning up:
+		 * the space before the first leaf entry needs to be free so it
+		 * can be expanded to hold the pointer to the new entry.
+		 */
+		if (INT_GET(enddup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+			dup = enddup = NULL;
+		/*
+		 * Check out the biggest freespace and see if it's the same one.
+		 */
+		else {
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
+			if (dup == enddup) {
+				/*
+				 * It is the biggest freespace, is it too small
+				 * to hold the new leaf too?
+				 */
+				if (INT_GET(dup->length, ARCH_CONVERT) < len + (uint)sizeof(*blp)) {
+					/*
+					 * Yes, we use the second-largest
+					 * entry instead if it works.
+					 */
+					if (INT_GET(bf[1].length, ARCH_CONVERT) >= len)
+						dup = (xfs_dir2_data_unused_t *)
+						      ((char *)block +
+						       INT_GET(bf[1].offset, ARCH_CONVERT));
+					else
+						dup = NULL;
+				}
+			} else {
+				/*
+				 * Not the same free entry,
+				 * just check its length.
+				 */
+				if (INT_GET(dup->length, ARCH_CONVERT) < len) {
+					dup = NULL;
+				}
+			}
+		}
+		compact = 0;
+	}
+	/*
+	 * If there are stale entries we'll use one for the leaf.
+	 * Is the biggest entry enough to avoid compaction?
+	 */
+	else if (INT_GET(bf[0].length, ARCH_CONVERT) >= len) {
+		dup = (xfs_dir2_data_unused_t *)
+		      ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
+		compact = 0;
+	}
+	/*
+	 * Will need to compact to make this work.
+	 */
+	else {
+		/*
+		 * Tag just before the first leaf entry.
+		 */
+		tagp = (xfs_dir2_data_off_t *)blp - 1;
+		/*
+		 * Data object just before the first leaf entry.
+		 */
+		dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+		/*
+		 * If it's not free then the data will go where the
+		 * leaf data starts now, if it works at all.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			if (INT_GET(dup->length, ARCH_CONVERT) + (INT_GET(btp->stale, ARCH_CONVERT) - 1) *
+			    (uint)sizeof(*blp) < len)
+				dup = NULL;
+		} else if ((INT_GET(btp->stale, ARCH_CONVERT) - 1) * (uint)sizeof(*blp) < len)
+			dup = NULL;
+		else
+			dup = (xfs_dir2_data_unused_t *)blp;
+		compact = 1;
+	}
+	/*
+	 * If this isn't a real add, we're done with the buffer.
+	 */
+	if (args->justcheck)
+		xfs_da_brelse(tp, bp);
+	/*
+	 * If we don't have space for the new entry & leaf ...
+	 */
+	if (!dup) {
+		/*
+		 * Not trying to actually do anything, or don't have
+		 * a space reservation: return no-space.
+		 */
+		if (args->justcheck || args->total == 0)
+			return XFS_ERROR(ENOSPC);
+		/*
+		 * Convert to the next larger format.
+		 * Then add the new entry in that format.
+		 */
+		error = xfs_dir2_block_to_leaf(args, bp);
+		xfs_da_buf_done(bp);
+		if (error)
+			return error;
+		return xfs_dir2_leaf_addname(args);
+	}
+	/*
+	 * Just checking, and it would work, so say so.
+	 */
+	if (args->justcheck)
+		return 0;
+	needlog = needscan = 0;
+	/*
+	 * If need to compact the leaf entries, do it now.
+	 * Leave the highest-numbered stale entry stale.
+	 * XXX should be the one closest to mid but mid is not yet computed.
+	 */
+	if (compact) {
+		int	fromidx;		/* source leaf index */
+		int	toidx;			/* target leaf index */
+
+		for (fromidx = toidx = INT_GET(btp->count, ARCH_CONVERT) - 1,
+			highstale = lfloghigh = -1;
+		     fromidx >= 0;
+		     fromidx--) {
+			if (INT_GET(blp[fromidx].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
+				if (highstale == -1)
+					highstale = toidx;	/* first stale seen from the top is kept */
+				else {
+					if (lfloghigh == -1)
+						lfloghigh = toidx;
+					continue;		/* drop every other stale entry */
+				}
+			}
+			if (fromidx < toidx)
+				blp[toidx] = blp[fromidx];
+			toidx--;
+		}
+		lfloglow = toidx + 1 - (INT_GET(btp->stale, ARCH_CONVERT) - 1);
+		lfloghigh -= INT_GET(btp->stale, ARCH_CONVERT) - 1;
+		INT_MOD(btp->count, ARCH_CONVERT, -(INT_GET(btp->stale, ARCH_CONVERT) - 1));
+		xfs_dir2_data_make_free(tp, bp,
+			(xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
+			(xfs_dir2_data_aoff_t)((INT_GET(btp->stale, ARCH_CONVERT) - 1) * sizeof(*blp)),
+			&needlog, &needscan);
+		blp += INT_GET(btp->stale, ARCH_CONVERT) - 1;	/* table now starts after the freed slots */
+		INT_SET(btp->stale, ARCH_CONVERT, 1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
+				&needlog, NULL);
+			needscan = 0;
+		}
+	}
+	/*
+	 * Set leaf logging boundaries to impossible state.
+	 * For the no-stale case they're set explicitly.
+	 */
+	else if (INT_GET(btp->stale, ARCH_CONVERT)) {
+		lfloglow = INT_GET(btp->count, ARCH_CONVERT);
+		lfloghigh = -1;
+	}
+	/*
+	 * Find the slot that's first lower than our hash value, -1 if none.
+	 */
+	for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	while (mid >= 0 && INT_GET(blp[mid].hashval, ARCH_CONVERT) >= args->hashval) {
+		mid--;
+	}
+	/*
+	 * No stale entries, will use enddup space to hold new leaf.
+	 */
+	if (INT_ISZERO(btp->stale, ARCH_CONVERT)) {
+		/*
+		 * Mark the space needed for the new leaf entry, now in use.
+		 */
+		xfs_dir2_data_use_free(tp, bp, enddup,
+			(xfs_dir2_data_aoff_t)
+			((char *)enddup - (char *)block + INT_GET(enddup->length, ARCH_CONVERT) -
+			 sizeof(*blp)),
+			(xfs_dir2_data_aoff_t)sizeof(*blp),
+			&needlog, &needscan);
+		/*
+		 * Update the tail (entry count).
+		 */
+		INT_MOD(btp->count, ARCH_CONVERT, +1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
+				&needlog, NULL);
+			needscan = 0;
+		}
+		/*
+		 * Adjust pointer to the first leaf entry, we're about to move
+		 * the table up one to open up space for the new leaf entry.
+		 * Then adjust our index to match.
+		 */
+		blp--;
+		mid++;
+		if (mid)
+			ovbcopy(&blp[1], blp, mid * sizeof(*blp));
+		lfloglow = 0;
+		lfloghigh = mid;
+	}
+	/*
+	 * Use a stale leaf for our new entry.
+	 */
+	else {
+		for (lowstale = mid;
+		     lowstale >= 0 &&
+			INT_GET(blp[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
+		     lowstale--)
+			continue;
+		for (highstale = mid + 1;
+		     highstale < INT_GET(btp->count, ARCH_CONVERT) &&
+			INT_GET(blp[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
+			(lowstale < 0 || mid - lowstale > highstale - mid);
+		     highstale++)
+			continue;
+		/*
+		 * Move entries toward the low-numbered stale entry.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == INT_GET(btp->count, ARCH_CONVERT) ||
+		     mid - lowstale <= highstale - mid)) {
+			if (mid - lowstale)
+				ovbcopy(&blp[lowstale + 1], &blp[lowstale],
+					(mid - lowstale) * sizeof(*blp));
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(mid, lfloghigh);
+		}
+		/*
+		 * Move entries toward the high-numbered stale entry.
+		 */
+		else {
+			ASSERT(highstale < INT_GET(btp->count, ARCH_CONVERT));
+			mid++;
+			if (highstale - mid)
+				ovbcopy(&blp[mid], &blp[mid + 1],
+					(highstale - mid) * sizeof(*blp));
+			lfloglow = MIN(mid, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		INT_MOD(btp->stale, ARCH_CONVERT, -1);
+	}
+	/*
+	 * Point to the new data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	/*
+	 * Fill in the leaf entry.
+	 */
+	INT_SET(blp[mid].hashval, ARCH_CONVERT, args->hashval);
+	INT_SET(blp[mid].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+	xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+	/*
+	 * Mark space for the data entry used.
+	 */
+	xfs_dir2_data_use_free(tp, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+	/*
+	 * Create the new data entry.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->namelen = args->namelen;
+	bcopy(args->name, dep->name, args->namelen);
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+	/*
+	 * Clean up the bestfree array and log the header, tail, and entry.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_log_entry(tp, bp, dep);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+int						/* error */
+xfs_dir2_block_getdents(
+	xfs_trans_t		*tp,		/* transaction (NULL) */
+	xfs_inode_t		*dp,		/* incore inode */
+	uio_t			*uio,		/* caller's buffer control */
+	int			*eofp,		/* eof reached? (out) */
+	xfs_dirent_t		*dbp,		/* caller's buffer */
+	xfs_dir2_put_t		put)		/* abi's formatting function */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dabuf_t		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	char			*endptr;	/* end of the data entries */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_put_args_t	p;		/* arg package for put rtn */
+	char			*ptr;		/* current data entry */
+	char			*savptr;	/* saved data entry */
+	int			wantoff;	/* starting block offset */
+
+	mp = dp->i_mount;
+	/*
+	 * If the block number in the offset is out of range, we're done.
+	 */
+	if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) {
+		*eofp = 1;
+		return 0;
+	}
+	/*
+	 * Can't read the block, give up, else get dabuf in bp.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	/*
+	 * Extract the byte offset we start at from the seek pointer.
+	 * We'll skip entries before this.
+	 */
+	wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset);
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	/*
+	 * Set up values for the loop.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	ptr = (char *)block->u;
+	endptr = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);	/* data area stops where the leaf table starts */
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+	/*
+	 * Loop over the data portion of the block.
+	 * Each object is a real entry (dep) or an unused one (dup).
+	 */
+	while (ptr < endptr) {
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		/*
+		 * Unused, skip it.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += INT_GET(dup->length, ARCH_CONVERT);
+			continue;
+		}
+
+		dep = (xfs_dir2_data_entry_t *)ptr;
+
+		savptr = ptr;	/* start of this entry, used for its cookie below */
+
+		/*
+		 * Bump pointer for the next iteration.
+		 */
+		ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+		/*
+		 * The entry is before the desired starting point, skip it.
+		 */
+		if ((char *)dep - (char *)block < wantoff)
+			continue;
+		/*
+		 * Set up argument structure for put routine.
+		 */
+		p.namelen = dep->namelen;
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+						    savptr - (char *)block);
+#if XFS_BIG_FILESYSTEMS
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT) + mp->m_inoadd;
+#else
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+#endif
+		p.name = (char *)dep->name;
+
+		/*
+		 * Put the entry in the caller's buffer.
+		 */
+		error = p.put(&p);
+
+		/*
+		 * If it wasn't stored, set the final offset to here & return.
+		 */
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					(char *)dep - (char *)block);
+			xfs_da_brelse(tp, bp);
+			return error;
+		}
+	}
+
+	/*
+	 * Reached the end of the block.
+	 * Set the offset to the block after m_dirdatablk and return.
+	 */
+	*eofp = 1;
+
+	uio->uio_offset =
+		XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
+
+	xfs_da_brelse(tp, bp);
+
+	return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+	xfs_trans_t		*tp,		/* transaction structure; also supplies the mount (tp->t_mountp) */
+	xfs_dabuf_t		*bp,		/* block buffer; bp->data is the directory block */
+	int			first,		/* index of first logged leaf (included in the range) */
+	int			last)		/* index of last logged leaf (included in the range) */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),	/* offset of first byte of blp[first] */
+		(uint)((char *)&blp[last + 1] - (char *)block - 1));		/* offset of last byte of blp[last] */
+}
+
+/*
+ * Log the block tail.
+ *
+ * The range handed to xfs_da_log_buf is given as byte offsets from the
+ * start of the block: the first byte of the tail structure through the
+ * last byte of the tail structure (btp + 1, minus one byte).
+ */
+static void
+xfs_dir2_block_log_tail(
+	xfs_trans_t		*tp,		/* transaction structure; also supplies the mount (tp->t_mountp) */
+	xfs_dabuf_t		*bp)		/* block buffer; bp->data is the directory block */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
+		(uint)((char *)(btp + 1) - (char *)block - 1));
+}
+
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ *
+ * When the name is found, the inode number stored in the matching data
+ * entry is copied into args->inumber, the buffer is released, and
+ * XFS_ERROR(EEXIST) is returned; "found" is therefore reported through
+ * the EEXIST code rather than through 0.
+ * Any error from xfs_dir2_block_lookup_int is returned unchanged.
+ */
+int						/* error */
+xfs_dir2_block_lookup(
+	xfs_da_args_t		*args)		/* dir lookup arguments; inumber is filled in on a match */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	xfs_dir2_trace_args("block_lookup", args);
+	/*
+	 * Get the buffer, look up the entry.
+	 * If not found (ENOENT) then return, have no buffer.
+	 * (xfs_dir2_block_lookup_int releases the buffer itself on its
+	 * ENOENT paths, so there is nothing to release here.)
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+		return error;
+	dp = args->dp;
+	mp = dp->i_mount;
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Get the offset from the leaf entry, to point to the data.
+	 * The leaf entry's address is converted to a byte offset within
+	 * the block with XFS_DIR2_DATAPTR_TO_OFF.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	/*
+	 * Fill in inode number, release the block.
+	 */
+	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	xfs_da_brelse(args->trans, bp);
+	return XFS_ERROR(EEXIST);
+}
+
+/*
+ * Internal block lookup routine.
+ *
+ * Reads directory block mp->m_dirdatablk, binary-searches the block's
+ * leaf entries for args->hashval, then walks every leaf entry carrying
+ * that hash value and compares the name stored in the data entry with
+ * args->name.
+ *
+ * Returns 0 on a match, with *bpp set to the buffer (not released, the
+ * caller owns it from then on) and *entno set to the index of the
+ * matching leaf entry.
+ * Returns XFS_ERROR(ENOENT) when no entry matches; the buffer has been
+ * released and *bpp / *entno are left untouched.
+ * Returns the error from xfs_da_read_buf if the block cannot be read.
+ */
+static int					/* error */
+xfs_dir2_block_lookup_int(
+	xfs_da_args_t		*args,		/* dir lookup arguments */
+	xfs_dabuf_t		**bpp,		/* returned block buffer */
+	int			*entno)		/* returned entry number */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			error;		/* error return value */
+	xfs_dahash_t		hash;		/* found hash value */
+	int			high;		/* binary search high index */
+	int			low;		/* binary search low index */
+	int			mid;		/* binary search current idx */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the buffer, return error if we can't get it.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Loop doing a binary search for our hash value.
+	 * Find our entry, ENOENT if it's not there.
+	 * The loop stops on any entry whose hash equals args->hashval,
+	 * not necessarily the first such entry; the backing-up step
+	 * below handles that.
+	 * NOTE(review): a binary search presumes the leaf entries are
+	 * ordered by hashval, and the ASSERT(low <= high) on the first
+	 * pass presumes btp->count is at least 1.  Neither is checked
+	 * here -- confirm both hold for every block this can read.
+	 */
+	for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; ; ) {
+		ASSERT(low <= high);
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+		if (low > high) {
+			ASSERT(args->oknoent);
+			xfs_da_brelse(tp, bp);
+			return XFS_ERROR(ENOENT);
+		}
+	}
+	/*
+	 * Back up to the first one with the right hash value.
+	 */
+	while (mid > 0 && INT_GET(blp[mid - 1].hashval, ARCH_CONVERT) == args->hashval) {
+		mid--;
+	}
+	/*
+	 * Now loop forward through all the entries with the
+	 * right hash value looking for our name.
+	 * At this point hash == args->hashval, so the loop condition
+	 * keeps going while the next entry still has the wanted hash.
+	 */
+	do {
+		if ((addr = INT_GET(blp[mid].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
+			continue;	/* no data entry behind this leaf entry; jumps to the while test, which still advances mid */
+		/*
+		 * Get pointer to the entry from the leaf.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
+		/*
+		 * Compare, if it's right give back buffer & entry number.
+		 * The length and first byte are compared before the full
+		 * bcmp of args->namelen bytes.
+		 */
+		if (dep->namelen == args->namelen &&
+		    dep->name[0] == args->name[0] &&
+		    bcmp(dep->name, args->name, args->namelen) == 0) {
+			*bpp = bp;
+			*entno = mid;
+			return 0;
+		}
+	} while (++mid < INT_GET(btp->count, ARCH_CONVERT) && INT_GET(blp[mid].hashval, ARCH_CONVERT) == hash);
+	/*
+	 * No match, release the buffer and return ENOENT.
+	 */
+	ASSERT(args->oknoent);
+	xfs_da_brelse(tp, bp);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int						/* error; 0 on success, else the lookup or block-to-shortform error */
+xfs_dir2_block_removename(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* block leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to fixup bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* shortform size */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("block_removename", args);
+	/*
+	 * Look up the entry in the block.  Gets the buffer and entry index.
+	 * It will always be there, the vnodeops level does a lookup first.
+	 * Should the lookup fail anyway, its error is returned as is.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Point to the data entry using the leaf entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	/*
+	 * Mark the data entry's space free.
+	 * needlog/needscan start at 0 and are passed by address so that
+	 * xfs_dir2_data_make_free can request the header logging and
+	 * bestfree rescan that are acted on further down.
+	 */
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(tp, bp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
+		XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+	/*
+	 * Fix up the block tail.
+	 * One more leaf entry is about to become stale.
+	 */
+	INT_MOD(btp->stale, ARCH_CONVERT, +1);
+	xfs_dir2_block_log_tail(tp, bp);
+	/*
+	 * Remove the leaf entry by marking it stale.
+	 * The entry keeps its slot and hash value; only its address is
+	 * overwritten with XFS_DIR2_NULL_DATAPTR, and just that one
+	 * leaf entry is logged.
+	 */
+	INT_SET(blp[ent].address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+	/*
+	 * Fix up bestfree, log the header if necessary.
+	 * The rescan comes first because it is also handed &needlog.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_data_check(dp, bp);
+	/*
+	 * See if the size as a shortform is good enough.
+	 * If the shortform size exceeds XFS_IFORK_DSIZE(dp) the directory
+	 * stays in block form: finish with the buffer and report success.
+	 */
+	if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+	    XFS_IFORK_DSIZE(dp)) {
+		xfs_da_buf_done(bp);
+		return 0;
+	}
+	/*
+	 * If it works, do the conversion.
+	 * The buffer is handed on to xfs_dir2_block_to_sf and its return
+	 * value becomes ours.
+	 */
+	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ *
+ * The entry named by args is looked up, its inode number is overwritten
+ * with args->inumber, and the changed data entry is logged.
+ * Returns 0 on success, or the error from xfs_dir2_block_lookup_int.
+ */
+int						/* error */
+xfs_dir2_block_replace(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	xfs_dir2_trace_args("block_replace", args);
+	/*
+	 * Lookup the entry in the directory.  Get buffer and entry index.
+	 * This will always succeed since the caller has already done a lookup.
+	 * Should it fail anyway, the lookup error is returned as is.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	mp = dp->i_mount;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Point to the data entry we need to change.
+	 * The ASSERT expects the new inode number to differ from the one
+	 * already stored in the entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
+	/*
+	 * Change the inode number to the new value.
+	 * Only the data entry is logged; the leaf entry and tail are
+	 * not modified by a replace.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	xfs_dir2_data_log_entry(args->trans, bp, dep);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ *
+ * Orders two xfs_dir2_leaf_entry_t by hash value only: returns -1 when
+ * a's hashval is smaller than b's, 1 when it is larger and 0 when the
+ * two are equal.  The hash values are read through INT_GET.
+ */
+static int					/* sort order */
+xfs_dir2_block_sort(
+	const void			*a,	/* first leaf entry */
+	const void			*b)	/* second leaf entry */
+{
+	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
+	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */
+
+	la = a;
+	lb = b;
+	return INT_GET(la->hashval, ARCH_CONVERT) < INT_GET(lb->hashval, ARCH_CONVERT) ? -1 :
+		(INT_GET(la->hashval, ARCH_CONVERT) > INT_GET(lb->hashval, ARCH_CONVERT) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ *
+ * dbp may be NULL on entry, in which case block mp->m_dirdatablk is
+ * read here.
+ *
+ * Returns 0 with the directory left in leaf form when the conversion
+ * is not possible: either the last data block is not completely empty
+ * while the directory is still larger than one directory block, or the
+ * last item in the data block is not free space big enough for the
+ * block tail plus the live leaf entries.
+ * After a successful conversion the block is additionally converted to
+ * shortform when xfs_dir2_block_sfsize says it fits in
+ * XFS_IFORK_DSIZE(dp); in that case the return value is that of
+ * xfs_dir2_block_to_sf.
+ * Errors from xfs_dir2_leaf_trim_data, xfs_da_read_buf and
+ * xfs_da_shrink_inode are returned as is.
+ * Every path through the out label finishes with whichever of lbp and
+ * dbp is still non-NULL via xfs_da_buf_done.
+ */
+int						/* error */
+xfs_dir2_leaf_to_block(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp,		/* leaf buffer */
+	xfs_dabuf_t		*dbp)		/* data buffer, or NULL to have it read here */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf bests table */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	int			error;		/* error return value */
+	int			from;		/* leaf from index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to scan for bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* bytes used */
+	xfs_dir2_data_off_t	*tagp;		/* end of entry (tag) */
+	int			to;		/* block/leaf to index */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	/*
+	 * If there are data blocks other than the first one, take this
+	 * opportunity to remove trailing empty data blocks that may have
+	 * been left behind during no-space-reservation operations.
+	 * These will show up in the leaf bests table.
+	 * A block counts as empty when its bests entry equals the block
+	 * size minus the data block header.  The loop ends once the
+	 * directory size is down to one block; if the last block is not
+	 * empty the conversion is abandoned with error 0.
+	 */
+	while (dp->i_d.di_size > mp->m_dirblksize) {
+		bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+		if (INT_GET(bestsp[INT_GET(ltp->bestcount, ARCH_CONVERT) - 1], ARCH_CONVERT) ==
+		    mp->m_dirblksize - (uint)sizeof(block->hdr)) {	/* sizeof only: block itself is not assigned yet */
+			if ((error =
+			    xfs_dir2_leaf_trim_data(args, lbp,
+				    (xfs_dir2_db_t)(INT_GET(ltp->bestcount, ARCH_CONVERT) - 1))))
+				goto out;
+		} else {
+			error = 0;
+			goto out;
+		}
+	}
+	/*
+	 * Read the data block if we don't already have it, give up if it fails.
+	 */
+	if (dbp == NULL &&
+	    (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
+		    XFS_DATA_FORK))) {
+		goto out;
+	}
+	block = dbp->data;
+	ASSERT(INT_GET(block->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+	/*
+	 * Size of the "leaf" area in the block.
+	 * That is the block tail plus one leaf entry for each live
+	 * (count minus stale) entry of the old leaf block.
+	 */
+	size = (uint)sizeof(block->tail) +
+	       (uint)sizeof(*lep) * (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+	/*
+	 * Look at the last data entry.
+	 * The final xfs_dir2_data_off_t in the block is read as a tag
+	 * holding the offset of the last item, which is then viewed as
+	 * an unused-space record.
+	 */
+	tagp = (xfs_dir2_data_off_t *)((char *)block + mp->m_dirblksize) - 1;
+	dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+	/*
+	 * If it's not free or is too short we can't do it.
+	 * Nothing has been modified yet, so this is a clean bail-out.
+	 */
+	if (INT_GET(dup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG || INT_GET(dup->length, ARCH_CONVERT) < size) {
+		error = 0;
+		goto out;
+	}
+	/*
+	 * Start converting it to block form.
+	 * The magic number lives in the header, hence needlog = 1.
+	 */
+	INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
+	needlog = 1;
+	needscan = 0;
+	/*
+	 * Use up the space at the end of the block (blp/btp).
+	 * The last "size" bytes of the block are claimed.
+	 */
+	xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+		&needlog, &needscan);
+	/*
+	 * Initialize the block tail.
+	 * count is the number of live leaf entries; no stale entries
+	 * are carried over, so stale is zeroed.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	INT_SET(btp->count, ARCH_CONVERT, INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+	INT_ZERO(btp->stale, ARCH_CONVERT);
+	xfs_dir2_block_log_tail(tp, dbp);
+	/*
+	 * Initialize the block leaf area.  We compact out stale entries.
+	 * Entries whose address is XFS_DIR2_NULL_DATAPTR are skipped;
+	 * the rest are copied in their existing order.
+	 */
+	lep = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+		if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		lep[to++] = leaf->ents[from];
+	}
+	ASSERT(to == INT_GET(btp->count, ARCH_CONVERT));
+	xfs_dir2_block_log_leaf(tp, dbp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
+	/*
+	 * Scan the bestfree if we need it and log the data block header.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * Pitch the old leaf block.
+	 * lbp is cleared whether or not the shrink succeeded, so the
+	 * out path never calls xfs_da_buf_done on it afterwards.
+	 * NOTE(review): presumably xfs_da_shrink_inode disposes of the
+	 * buffer in both cases -- confirm against its definition.
+	 */
+	error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
+	lbp = NULL;
+	if (error) {
+		goto out;
+	}
+	/*
+	 * Now see if the resulting block can be shrunken to shortform.
+	 * If not, the directory stays in block form and we are done.
+	 */
+	if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+	    XFS_IFORK_DSIZE(dp)) {
+		error = 0;
+		goto out;
+	}
+	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);	/* dbp is handed on; no xfs_da_buf_done here */
+out:
+	if (lbp)
+		xfs_da_buf_done(lbp);
+	if (dbp)
+		xfs_da_buf_done(dbp);
+	return error;
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */ +int /* error */ +xfs_dir2_sf_to_block( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dir2_db_t blkno; /* dir-relative block # (0) */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + char *buf; /* sf buffer */ + int buf_len; + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + int dummy; /* trash */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int endoffset; /* end of data objects */ + int error; /* error return value */ + int i; /* index */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to scan block freespc */ + int newoffset; /* offset from current entry */ + int offset; /* target block offset */ + xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_data_off_t *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args("sf_to_block", args); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bomb out if the shortform directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + /* + * Copy the directory into the stack buffer.