<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Reshaping long data to wide in Statistical Procedures</title>
    <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704375#M34056</link>
    <description>&lt;P&gt;If SNP values are longer than 32 characters, and identical in the first 32, then that is where your duplicates happen. Keep in mind that SAS variable names are limited to 32 characters.&lt;/P&gt;</description>
    <pubDate>Tue, 08 Dec 2020 08:46:47 GMT</pubDate>
    <dc:creator>Kurt_Bremser</dc:creator>
    <dc:date>2020-12-08T08:46:47Z</dc:date>
    <item>
      <title>Reshaping long data to wide</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704318#M34049</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;I am relatively&amp;nbsp; new to SAS and I am working with microarray SNP data, fairly large. I need to reshape my data from long to wide. I tried the code below on a smaller data set and it works well. An example data set is shown below:&lt;/P&gt;&lt;P&gt;Data MAlong&lt;/P&gt;&lt;TABLE border="0" cellspacing="0" cellpadding="0"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;ID&lt;/TD&gt;&lt;TD&gt;SNPs&lt;/TD&gt;&lt;TD&gt;Genotype&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1234&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1235&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1236&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1237&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1238&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1239&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1240&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1241&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1242&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;rs1243&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1234&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1235&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1236&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1237&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1238&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1239&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1240&lt;/TD&gt;&lt;TD&gt;AG&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1241&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1242&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;rs1243&lt;/TD&gt;&lt;TD&gt;TC&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;25342&lt;/TD&gt;&lt;TD&gt;rs1234&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;25342&lt;/TD&gt;&lt;TD&gt;rs1235&lt;/TD&gt;&lt;TD&gt;AG&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;SAS code:&lt;/P&gt;&lt;P&gt;PROC TRANSPOSE data = MAlong out= MAWide ;&lt;BR /&gt;by ID not sorted;&lt;BR /&gt;var Genotype ;&lt;BR /&gt;ID SNPs;&lt;BR /&gt;run;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Resulting data set&lt;/P&gt;&lt;P&gt;Data MAwide&lt;/P&gt;&lt;TABLE border="0" cellspacing="0" 
cellpadding="0"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;ID&lt;/TD&gt;&lt;TD&gt;rs1234&lt;/TD&gt;&lt;TD&gt;rs1235&lt;/TD&gt;&lt;TD&gt;rs1236&lt;/TD&gt;&lt;TD&gt;rs1237&lt;/TD&gt;&lt;TD&gt;rs1238&lt;/TD&gt;&lt;TD&gt;rs1239&lt;/TD&gt;&lt;TD&gt;rs1240&lt;/TD&gt;&lt;TD&gt;rs1241&lt;/TD&gt;&lt;TD&gt;rs1242&lt;/TD&gt;&lt;TD&gt;rs1243&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;23456&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;17235&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;TD&gt;AG&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;TD&gt;TC&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;25342&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;TD&gt;AG&lt;/TD&gt;&lt;TD&gt;AA&lt;/TD&gt;&lt;TD&gt;AG&lt;/TD&gt;&lt;TD&gt;TT&lt;/TD&gt;&lt;TD&gt;CC&lt;/TD&gt;&lt;TD&gt;--&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;TD&gt;GG&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For the larger data set, I am getting a duplicate error by the ID SNPS that I am using. I know the SNP names are not duplicates, they are slightly different at the tail end of the name but it seems SAS is assuming they are duplicates early. How can I make my script ignore duplicates? Using nodupkeys does not work as the SNP names are re-used for all SNPs. This is data from an Illumina Infinium array and their report lists genotypes per sample in long format, one at a time, and then starts again.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Joy&lt;/P&gt;</description>
      <pubDate>Tue, 08 Dec 2020 01:15:38 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704318#M34049</guid>
      <dc:creator>joa2</dc:creator>
      <dc:date>2020-12-08T01:15:38Z</dc:date>
    </item>
    <item>
      <title>Re: Reshaping long data to wide</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704323#M34051</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/358014"&gt;@joa2&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;... stuff deleted ...&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;For the larger data set, I am getting a duplicate error by the ID SNPS that I am using. I know the SNP names are not duplicates, they are slightly different at the tail end of the name but it seems SAS is assuming they are duplicates early. How can I make my script ignore duplicates? Using nodupkeys does not work as the SNP names are re-used for all SNPs. This is data from an Illumina Infinium array and their report lists genotypes per sample in long format, one at a time, and then starts again.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Thanks,&lt;/P&gt;
&lt;P&gt;Joy&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;You "know SNP names are not duplicates", but apparently SAS doesn't agree (you haven't shown your log, so I am assuming that is what you are describing).&amp;nbsp; So find the duplicates that SAS sees.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Write to NON_UNIQUE only the rows whose ID/SNPS key occurs more than once. */
proc sort nouniquekey
          data=MAlong
          out=non_unique;
  by id snps;
run;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Every obs in non_unique is a duplicate ID/SNPS dyad.&amp;nbsp; You can examine them and decide what to do with them to prepare for proc transpose.&amp;nbsp;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 08 Dec 2020 01:46:48 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704323#M34051</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2020-12-08T01:46:48Z</dc:date>
    </item>
    <item>
      <title>Re: Reshaping long data to wide</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704324#M34052</link>
      <description>&lt;P&gt;This is probably because the length of the SNPs variable is 8. Try increasing it. For example, the following statements produce an error without the LENGTH statement, but run fine with it.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data a; 
/* Example long-format input: one row per ID/SNP pair, read from the inline data below. */
/* SNPs is declared 200 characters wide before INPUT. The first two data rows   */
/* (rs1234aaaa, rs1234aaab) share their first 8 characters, so an 8-character   */
/* variable could not tell them apart.                                          */
length SNPs $ 200;
input ID	SNPs $	Genotype $;
datalines;
23456	rs1234aaaa	CC
23456	rs1234aaab	CC
23456	rs1236	TT
23456	rs1237	AA
23456	rs1238	TT
23456	rs1239	GG
23456	rs1240	GG
23456	rs1241	TT
23456	rs1242	CC
23456	rs1243	AA
17235	rs1234	TT
17235	rs1235	GG
17235	rs1236	TT
17235	rs1237	CC
17235	rs1238	AA
17235	rs1239	AA
17235	rs1240	AG
17235	rs1241	GG
17235	rs1242	GG
17235	rs1243	TC
25342	rs1234	AA
25342	rs1235	AG
;
/* Pivot A to wide form in B: one row per run of equal ID values,       */
/* one Genotype column per SNPs value.                                  */
proc transpose data=a out=b;
  id SNPs;
  var Genotype;
  by ID notsorted;
run;

 &lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Tue, 08 Dec 2020 02:02:50 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704324#M34052</guid>
      <dc:creator>StatDave</dc:creator>
      <dc:date>2020-12-08T02:02:50Z</dc:date>
    </item>
    <item>
      <title>Re: Reshaping long data to wide</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704325#M34053</link>
      <description>The data set is 650K SNPs by 1265 samples, so it is hard to look at all duplicates.&lt;BR /&gt;The log file only mentions a few before stopping. What would be the best way&lt;BR /&gt;to solve this issue?&lt;BR /&gt;</description>
      <pubDate>Tue, 08 Dec 2020 02:21:07 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704325#M34053</guid>
      <dc:creator>joa2</dc:creator>
      <dc:date>2020-12-08T02:21:07Z</dc:date>
    </item>
    <item>
      <title>Re: Reshaping long data to wide</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704326#M34054</link>
      <description>The SNP variable length is 200, is there a way to add length to the proc&lt;BR /&gt;transpose statement? The actual data set is relatively large, 650K SNPs for&lt;BR /&gt;1265 samples.&lt;BR /&gt;</description>
      <pubDate>Tue, 08 Dec 2020 02:39:07 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704326#M34054</guid>
      <dc:creator>joa2</dc:creator>
      <dc:date>2020-12-08T02:39:07Z</dc:date>
    </item>
    <item>
      <title>Re: Reshaping long data to wide</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704338#M34055</link>
      <description>&lt;P&gt;Then you should recode the SNPs character variable to an integer-based code to generate a 1-to-1 mapping.&amp;nbsp; Use that code as the ID variable, and use the original SNPs as the IDLABEL:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data MAlong ;
  /* Example long-format input: one row per ID/SNP pair, read from the inline data below. */
  /* SNPs is read with a $200. informat and Genotype with $2.                             */
  input ID	SNPs :$200.	Genotype :$2. ;
  /* The last two data rows carry 33-character SNPs values that differ only in their */
  /* final character; INPUT reads three fields, so the trailing note on those rows   */
  /* is not read.                                                                    */
datalines;
23456	rs1234	CC
23456	rs1235	CC
23456	rs1236	TT
23456	rs1237	AA
23456	rs1238	TT
23456	rs1239	GG
23456	rs1240	GG
23456	rs1241	TT
23456	rs1242	CC
23456	rs1243	AA
17235	rs1234	TT
17235	rs1235	GG
17235	rs1236	TT
17235	rs1237	CC
17235	rs1238	AA
17235	rs1239	AA
17235	rs1240	AG
17235	rs1241	GG
17235	rs1242	GG
17235	rs1243	TC
25342	rs1234	AA
25342	rs1235	AG
25342   rs3456789_123456789_123456789_12A   XA     33 characters for SNPs
25342   rs3456789_123456789_123456789_12B   XB     33 characters for SNPs
run;

/* Reduce MALONG to one row per distinct SNPs value, ordered by SNPs. */
proc sort nodupkey
          data=malong(keep=snps)
          out=snp_list;
  by snps;
run;

/*
 * View PRE_TRANSPOSE: every row of MALONG plus SNP_CODE, a short code
 * ('SCode_' followed by a zero-padded sequence number) assigned to each
 * distinct SNPs value in SNP_LIST, in SNP_LIST order.
 *
 * Inputs : SNP_LIST (one row per distinct SNPs value), MALONG.
 * Output : all MALONG variables plus SNP_CODE; helper variables whose
 *          names start with an underscore are dropped.
 */
data pre_transpose (drop=_:) / view=pre_transpose;
  if _n_=1 then do;
    /* Never executed: defines SNPS in the program data vector and captures */
    /* the SNP_LIST row count in _N_SNPS.                                   */
    if 0 then set snp_list nobs=_n_snps;
    /* 'SCode_' is 6 characters; leave room for up to 10 digits. */
    length snp_code $16;
    /* Pad to 4 digits as before, but widen when SNP_LIST has more than 9999 */
    /* rows: a fixed 4-digit format cannot represent larger sequence numbers */
    /* distinctly, so the codes would stop being unique.                     */
    _width=max(4,lengthn(cats(_n_snps)));
    declare hash h ();
      h.definekey('snps');
      h.definedata('snp_code');
      h.definedone();
    /* Load the lookup table: SNPs value -> SNP_CODE. */
    do until (end_of_snps);
      set snp_list end=end_of_snps;
      _s+1;
      snp_code=catx('_','SCode',putn(_s,cats('z',_width,'.')));
      h.add();
    end;
  end;
  /* For each MALONG row, fetch the code stored for its SNPs value. */
  set malong;
  h.find();
run;

/* Pivot PRE_TRANSPOSE to wide form in MAWIDE: columns are named from     */
/* SNP_CODE and labelled with the original SNPs value; _NAME_ is dropped. */
proc transpose data=pre_transpose
               out=MAWide(drop=_name_);
  id snp_code;
  idlabel SNPs;
  var Genotype;
  by ID notsorted;
run;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;The reason for the sort is so that the order of the columns represents the snps values in alphabetical order.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 08 Dec 2020 04:23:52 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704338#M34055</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2020-12-08T04:23:52Z</dc:date>
    </item>
    <item>
      <title>Re: Reshaping long data to wide</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704375#M34056</link>
      <description>&lt;P&gt;If SNP values are longer than 32 characters, and identical in the first 32, then that is where your duplicates happen. Keep in mind that SAS variable names are limited to 32 characters.&lt;/P&gt;</description>
      <pubDate>Tue, 08 Dec 2020 08:46:47 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Reshaping-long-data-to-wide/m-p/704375#M34056</guid>
      <dc:creator>Kurt_Bremser</dc:creator>
      <dc:date>2020-12-08T08:46:47Z</dc:date>
    </item>
  </channel>
</rss>

