public inbox for git@vger.kernel.org 
 help / color / mirror / Atom feed
From: Robin Rosenberg <robin.rosenberg.lists@dewire•com>
To: "Torsten Bögershausen" <tboegi@web•de>
Cc: git@vger•kernel.org
Subject: Re: [RFC] i18n.pathencoding
Date: Mon, 03 Sep 2012 00:59:00 +0200	[thread overview]
Message-ID: <5043E4B4.9050801@dewire.com> (raw)
In-Reply-To: <201209010811.33994.tboegi@web.de>

Torsten Bögershausen skrev 2012-09-01 08.11:
 > Allow path names to be encoded in UTF-8 in the repository
 > and checkout out as e.g. ISO-8859-1 in the working tree.

Ack for attempting this.

Did it myself in 2007, but times weren't ripe then, I guess.

 > +i18n.pathEncoding::
 > +	This option is only used by some implementations of git.
 > +	When "git init" sets core.supportspathencoding to true,
 > +	i18n.pathEncoding can be set to re-encode path names when
 > +	a working tree is checked out.
 > +	Path names may be e.g. encoded in ISO-8859-1 and are stored as
 > +	UTF-8 encoded in the repository.
 > +	When not set, the encoding of path names is the same in working tree
 > +	and the repository.

"If set, then core.precomposeunicode is ignored on Mac OS X."

 > diff --git a/compat/reencode_pathname.c b/compat/reencode_pathname.c
 > new file mode 100644
 > index 0000000..3bdc776
 > --- /dev/null
 > +++ b/compat/reencode_pathname.c
 > @@ -0,0 +1,441 @@
 > +/*
 > + * Converts pathnames from one encoding into another.
 > + * The pathnames are stored as UTF-8 in the repository,
 > + * and might be checkout out as e.g. ISO-8859-1 in the working tree
 > + *
 > + * On MacOS X decomposed unicode is converted into precomposed unicode.
, ignoring the setting of core.precomposeunicode.

[...]
 > + */
 > +
 > +#define REENCODE_PATHNAME_C
 > +#include "cache.h"
 > +#include "utf8.h"
 > +#include "reencode_pathname.h"
 > +
 > +#if defined(OLD_ICONV) || (defined(__sun__) && !defined(_XPG6))
 > +	typedef const char *iconv_ibp;
 > +#else
 > +	typedef char *iconv_ibp;
 > +#endif
 > +
 > +const static char *repo_path_encoding = "UTF-8";
 > +
 > +static iconv_t iconv_open_or_die(const char *tocode, const char *fromcode)
 > +{
 > +	iconv_t my_iconv;
 > +	my_iconv = iconv_open(tocode, fromcode);
join these two lines

 > +	if (my_iconv == (iconv_t) -1)
 > +		die_errno(_("iconv_open(%s,%s) failed"), tocode, fromcode);
 > +	return my_iconv;
 > +}
 > +
 > +static size_t has_non_ascii(const char *s, size_t maxlen, size_t *strlen_c)
 > +{
 > +	const uint8_t *ptr = (const uint8_t *)s;
 > +	size_t strlen_chars = 0;
 > +	size_t ret = 0;
 > +
 > +	if (!ptr || !*ptr)
 > +		return 0;
 > +
 > +	while (*ptr && maxlen) {
 > +		if (*ptr & 0x80)
 > +			ret++;
 > +		strlen_chars++;
 > +		ptr++;
 > +		maxlen--;
 > +	}
 > +	if (strlen_c)
 > +		*strlen_c = strlen_chars;
 > +
 > +	return ret;
 > +}
 > +
 > +#ifdef PRECOMPOSE_UNICODE
 > +void probe_utf8_pathname_composition(char *path, int len)
 > +{
 > +	static const char *auml_nfc = "\xc3\xa4";
 > +	static const char *auml_nfd = "\x61\xcc\x88";
 > +	int output_fd;
 > +	if (precomposed_unicode != -1)
 > +		return; /* We found it defined in the global config, respect it */
a blank line here would be nice

 > +	strcpy(path + len, auml_nfc);
 > +	output_fd = open(path, O_CREAT|O_EXCL|O_RDWR, 0600);
 > +	if (output_fd >= 0) {
 > +		close(output_fd);
 > +		strcpy(path + len, auml_nfd);
 > +		/* Indicate to the user, that we can configure it to true */
 > +		if (!access(path, R_OK))
 > +			git_config_set("core.precomposeunicode", "false");
 > +		/* To be backward compatible, set precomposed_unicode to 0 */
 > +		precomposed_unicode = 0;
 > +		strcpy(path + len, auml_nfc);
 > +		if (unlink(path))
 > +			die_errno(_("failed to unlink '%s'"), path);
 > +	}
 > +}
 > +#endif

[...]

 > +struct dirent_psx *renc_pn_readdir(RENC_FN_DIR *renc_pn_dir)
 > +{
 > +	struct dirent *res;
 > +	res = readdir(renc_pn_dir->dirp);
 > +	if (res) {
 > +		size_t namelenz = strlen(res->d_name) + 1; /* \0 */
 > +		size_t new_len_needed = 0;
 > +		int ret_errno = errno;
 > +
 > +		renc_pn_dir->dirent_utf8->d_ino	 = res->d_ino;
 > +		renc_pn_dir->dirent_utf8->d_type = res->d_type;
 > +	do {
 > +		 if (new_len_needed > renc_pn_dir->dirent_utf8->max_name_len) {
indent

[...]

 > diff --git a/environment.c b/environment.c
 > index 85edd7f..ba81575 100644
 > --- a/environment.c
 > +++ b/environment.c
 > @@ -59,6 +59,7 @@ int grafts_replace_parents = 1;
 >   int core_apply_sparse_checkout;
 >   int merge_log_config = -1;
 >   int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
 > +const char *wt_path_encoding = NULL;
indent

 >   struct startup_info *startup_info;
 >   unsigned long pack_size_limit_cfg;
 >
 > diff --git a/git-compat-util.h b/git-compat-util.h
 > index 35b095e..877b060 100644
 > --- a/git-compat-util.h
 > +++ b/git-compat-util.h
 > @@ -153,13 +153,21 @@
 >   #endif
 >   #endif
 >
 > -/* used on Mac OS X */
 > -#ifdef PRECOMPOSE_UNICODE
 > -#include "compat/precompose_utf8.h"
 > +#if defined(PATH_ENCODING) || defined(PRECOMPOSE_UNICODE)
 > +#include "compat/reencode_pathname.h"
 >   #else
 > -#define precompose_str(in,i_nfd2nfc)
 > -#define precompose_argv(c,v)
 > -#define probe_utf8_pathname_composition(a,b)
 > +#define reencode_argv(c,v)
 > +#endif
 > +
 > +/* needed for Mac OS X */
 > +#ifndef PRECOMPOSE_UNICODE
 > +#define probe_utf8_pathname_composition(a,b);
 > +#endif
 > +
 > +#ifndef PATH_ENCODING
 > +#define str_worktree2repolen(in, insz) (NULL)
 > +#define str_repo2worktree(in) (NULL)
 > +#define str_worktree2repo(in) (NULL)
 >   #endif
 >
 >   #ifndef NO_LIBGEN_H
 > diff --git a/parse-options.c b/parse-options.c
 > index c1c66bd..5840c18 100644
 > --- a/parse-options.c
 > +++ b/parse-options.c
 > @@ -476,7 +476,7 @@ int parse_options(int argc, const char **argv, const char *prefix,
 >   		usage_with_options(usagestr, options);
 >   	}
 >
 > -	precompose_argv(argc, argv);
 > +	reencode_argv(argc, argv);
 >   	return parse_options_end(&ctx);
 >   }
 >
 > diff --git a/t/t3911-i18n-filename-8859.sh b/t/t3911-i18n-filename-8859.sh
 > new file mode 100755
 > index 0000000..aa2be57
 > --- /dev/null
 > +++ b/t/t3911-i18n-filename-8859.sh
 > @@ -0,0 +1,251 @@
 > +#!/bin/sh
 > +#
 > +# Copyright (c) 2010 Torsten Bögershausen
 > +#
 > +
 > +test_description='file system encodings UTF-8 ISO8859-1'
 > +
 > +. ./test-lib.sh
 > +
 > +fname_UTF_8=`printf '\303\206\302\242'`
 > +fname_ISO8859_1=`printf '\306\242'`
 > +Euro_utf8=`printf '\342\202\254'`
 > +supportspathencoding=`git config core.supportspathencoding` || :
 > +
 > +
 > +add_file_dir_link() {
 > +	local bname=$1
 > +	local fname=$2
 > +	test_expect_success "add file $fname.f $bname" '
 > +		git checkout master &&
 > +		git checkout -b add_f_$bname &&
 > +		>$fname.f &&
 > +		git add $fname.f &&
 > +		git commit -m "add fname"
 > +	'
 > +
 > +	test_expect_success "add dir $fname.d $bname" '
 > +		git checkout master &&
 > +		git checkout -b add_d_$bname &&
 > +		mkdir $fname.d &&
 > +		touch $fname.d/$fname.f &&
 > +		git add $fname.d/$fname.f &&
 > +		git commit -m "add fname.d/fname"
 > +	'
 > +
 > +	i=0
 > +	for src in x $fname; do
 > +		for dst in x $fname; do
 > +			test_expect_success "add link $dst.l->$src.f on branch add_l_${i}_$bname" '
 > +				git checkout master &&
 > +				git checkout -b add_l_${i}_$bname &&
 > +				ln -s $src.f $dst.l &&
 > +				git add $dst.l &&
 > +				git commit -m "add fname.l $i"
 > +			'
 > +			i=$(($i+1))
 > +		done
 > +	done
 > +}
 > +
 > +test_expect_success "setup add rm x" '
 > +	>x &&
 > +	git add x &&
 > +	git commit -m "1st commit" &&
 > +	git rm x &&
 > +	git commit -m "rm x"
 > +'
 > +
 > +#combinations to be tested:
 > +# UTF-8     -> ISO8859-1
 > +# ISO8859-1 -> UTF-8
 > +
 > +if test "$supportspathencoding"
 > +then
 > +	srcencodings="ISO8859-1 UTF-8"
 > +	for srcenc in $srcencodings
 > +	do
 > +		case $srcenc in
 > +		ISO8859-1)
 > +			dstenc=UTF-8
 > +		;;
 > +		UTF-8)
 > +			dstenc=ISO8859-1
 > +		;;
 > +		UTF-8-MAC)
 > +			dstenc=UTF-8
 > +		;;
 > +		*)
 > +			echo >&2 "Wrong encoding $srcenc"
 > +			exit 1
 > +		;;
 > +		esac
 > +		eval fname_src=\$fname_$(echo $srcenc | sed -e 's/-/_/g' -e 's/_MAC//')
 > +		eval fname_dst=\$fname_$(echo $dstenc | sed -e 's/-/_/g')
 > +		test_expect_success "setup $srcenc" '
 > +			git checkout master &&
 > +			git config i18n.pathencoding $srcenc
 > +		'
 > +		add_file_dir_link $srcenc $fname_src
 > +
 > +		test_expect_success "setup $dstenc" '
 > +			git checkout master &&
 > +			echo "git checkout Master" >&2
 > +			ls -l >&2
 > +			git config i18n.pathencoding $dstenc
 > +		'
 > +
 > +		test_expect_success "checkout file $dstenc (was $srcenc)" '
 > +			git checkout add_f_$srcenc
 > +		'
 > +
 > +		test_expect_success "exists file $dstenc (was $srcenc)" '
 > +			test -f $fname_dst.f
 > +		'
 > +
 > +		test_expect_success "log file $dstenc (was $srcenc)" '
 > +			git log $fname_dst.f
 > +		'
 > +
 > +		test_expect_success "git mv" '
 > +			git checkout -b mv_file_$srcenc &&
 > +			git mv $fname_dst.f XX.f &&
 > +			git commit -m "git mv fname_dst.f XX.f"
 > +		'
 > +
 > +		test_expect_success "checkout dir $dstenc (was $srcenc)" '
 > +			git checkout add_d_$srcenc
 > +		'
 > +
 > +		test_expect_success "exist dir $dstenc (was $srcenc)" '
 > +			test -d $fname_dst.d
 > +		'
 > +
 > +		test_expect_success "log dir $dstenc (was $srcenc)" '
 > +			git log $fname_dst.d
 > +		'
 > +
 > +		i=0
 > +		for src in x $fname_dst; do
 > +			for dst in x $fname_dst; do
 > +				test_expect_success "checkout link $dst.l->$src.f branch add_l_${i}_$srcenc" '
 > +					git checkout add_l_${i}_$srcenc
 > +				'
 > +				test_expect_success "exist link $dst.l->$src.f branch add_l_${i}_$srcenc" '
 > +					test -L $dst.l
 > +				'
 > +				test_expect_success "log link $dst.l->$src.f branch add_l_${i}_$srcenc" '
 > +					git log $dst.l
 > +				'
 > +				test_expect_success "readlink $dst.l->$src.f branch add_l_${i}_$srcenc" '
 > +					echo "$src.f" >expect &&
 > +					readlink "$dst.l" > actual &&
 > +					test_cmp expect actual &&
 > +					rm expect actual
 > +				'
 > +				i=$(($i+1))
 > +			done
 > +		done
 > +	done
 > +	# Make sure that Euro sign can NOT be checked out in 8859
"8859-1". The euro sign exists in 8859-15.

 > +	#fname_src=Euro
 > +	test_expect_success "setup UTF-8" '
 > +		git checkout master &&
 > +		git config i18n.pathencoding UTF-8
 > +	'
 > +	add_file_dir_link Euro $Euro_utf8
 > +
 > +	test_expect_success "setup ISO8859-1" '
 > +		git checkout master &&
 > +		rm -rf * &&
 > +		git config i18n.pathencoding ISO8859-1
 > +	'
 > +	test_expect_success "checkout file Euro branch add_f_Euro" '
 > +		git checkout add_f_Euro
Missing && ?

 > +		echo *  >actual &&
 > +		echo "*" >expect &&
 > +		test_cmp expect actual &&
 > +		rm expect actual
 > +	'
 > +
 > +	test_expect_success "checkout dir Euro branch add_d_Euro" '
 > +		rm -rf * &&
 > +		test_must_fail git checkout add_d_Euro
 > +	'
 > +
 > +	test_expect_success "Cleanup" '
 > +		git config i18n.pathencoding UTF-8 &&
 > +		git checkout master &&
 > +		rm -rf * &&
 > +		git reset --hard &&
 > +		git config i18n.pathencoding ISO8859-1
 > +	'
 > +
 > +	test_expect_success "checkout link Euro.l->x.f branch add_l_1_Euro" '
 > +		! git checkout add_l_1_Euro
 > +	'
 > +
 > +	test_expect_success "No link Euro.l->x.f" '
 > +		echo *  >actual &&
 > +		echo "*" >expect &&
 > +		test_cmp expect actual &&
 > +		rm expect actual
 > +	'
 > +
 > +	test_expect_success "Cleanup after Euro.l->x.f" '
 > +		git config i18n.pathencoding UTF-8 &&
 > +		git checkout master &&
 > +		rm -rf * &&
 > +		git reset --hard &&
 > +		git config i18n.pathencoding ISO8859-1
 > +	'
 > +
 > +	# Checkoing out a soft link pointing to a filename outside
"checking"

 > +	# 8859-1 should fail
 > +	test_expect_failure "checkout link x.l->Euro.f branch add_l_2_Euro" '
 > +		! git checkout add_l_2_Euro
 > +	'
 > +
 > +	test_expect_success "No link x.f->Euro.l" '
 > +		echo *  >actual &&
 > +		echo "*" >expect &&
 > +		test_cmp expect actual &&
 > +		rm expect actual
 > +	'
 > +
 > +	test_expect_success "Cleanup after link x.l->Euro.f branch" '
 > +		git config i18n.pathencoding UTF-8 &&
 > +		git checkout master &&
 > +		rm -rf * &&
 > +		git reset --hard &&
 > +		git config i18n.pathencoding ISO8859-1
 > +	'
 > +
 > +	test_expect_success "checkout link Euro.l->Euro.f branch add_l_3_Euro" '
 > +		! git checkout add_l_3_Euro
 > +	'
 > +
 > +	test_expect_success "No link Euro.l->Euro.f" '
 > +		echo *  >actual &&
 > +		echo "*" >expect &&
 > +		test_cmp expect actual &&
 > +		rm expect actual
 > +	'
 > +
 > +else
 > +	test_expect_success "setup 8859" '
"8859-1"

 > +		git config i18n.pathencoding ISO8859-1 &&
 > +		git checkout -b add_file_8859 &&
 > +		> $fname_src.f &&
 > +		git add $fname_src.f &&
 > +		git commit -m "add fname_src" &&
 > +		git config i18n.pathencoding UTF-8 &&
 > +		rm -rf * &&
 > +		git reset --hard
 > +	'
 > +	test_expect_success "Silent support of pathencoding" '
 > +		test_must_fail test -f $fname_UTF_8.f
 > +	'
 > +fi
 > +
 > +test_done

-- robin

  reply	other threads:[~2012-09-02 23:07 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-09-01  6:11 [RFC] i18n.pathencoding Torsten Bögershausen
2012-09-02 22:59 ` Robin Rosenberg [this message]
2012-09-08 10:09   ` Torsten Bögershausen
2012-09-04 12:23 ` Nguyen Thai Ngoc Duy
2012-09-04 17:19   ` Junio C Hamano
2012-09-04 19:51     ` Torsten Bögershausen
2012-09-04 20:12       ` Junio C Hamano
2012-09-05 19:52         ` Torsten Bögershausen
2012-09-05 11:11     ` Nguyen Thai Ngoc Duy
2012-09-05 19:49       ` Torsten Bögershausen
2012-09-06  3:24         ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5043E4B4.9050801@dewire.com \
    --to=robin.rosenberg.lists@dewire$(echo .)com \
    --cc=git@vger$(echo .)kernel.org \
    --cc=tboegi@web$(echo .)de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox